From 87f6013c45b6044a983971b5e2f044155a10913b Mon Sep 17 00:00:00 2001 From: 6clc Date: Thu, 15 Jun 2023 14:52:32 +0800 Subject: [PATCH 01/14] feat(cmake): add cmake of cinn --- CMakeLists.txt | 41 ++- cmake/cinn.cmake | 299 ++++++++++++++++++++ cmake/cinn/config.cmake | 11 + cmake/cinn/core.cmake | 459 +++++++++++++++++++++++++++++++ cmake/cinn/export.map | 6 + cmake/cinn/external/absl.cmake | 78 ++++++ cmake/cinn/external/boost.cmake | 65 +++++ cmake/cinn/external/ginac.cmake | 36 +++ cmake/cinn/external/isl.cmake | 32 +++ cmake/cinn/external/jitify.cmake | 28 ++ cmake/cinn/external/llvm.cmake | 129 +++++++++ cmake/cinn/external/openmp.cmake | 37 +++ cmake/cinn/llvm.cmake | 86 ++++++ cmake/cinn/nvrtc.cmake | 24 ++ cmake/cinn/nvtx.cmake | 53 ++++ cmake/cinn/system.cmake | 106 +++++++ cmake/cinn/version.cmake | 76 +++++ cmake/external/cinn.cmake | 96 ------- cmake/external/pybind11.cmake | 2 +- cmake/third_party.cmake | 37 ++- python/CMakeLists.txt | 40 +++ python/setup_cinn.py.in | 181 ++++++++++++ 22 files changed, 1803 insertions(+), 119 deletions(-) create mode 100644 cmake/cinn.cmake create mode 100755 cmake/cinn/config.cmake create mode 100644 cmake/cinn/core.cmake create mode 100644 cmake/cinn/export.map create mode 100644 cmake/cinn/external/absl.cmake create mode 100644 cmake/cinn/external/boost.cmake create mode 100644 cmake/cinn/external/ginac.cmake create mode 100644 cmake/cinn/external/isl.cmake create mode 100644 cmake/cinn/external/jitify.cmake create mode 100644 cmake/cinn/external/llvm.cmake create mode 100644 cmake/cinn/external/openmp.cmake create mode 100644 cmake/cinn/llvm.cmake create mode 100644 cmake/cinn/nvrtc.cmake create mode 100644 cmake/cinn/nvtx.cmake create mode 100644 cmake/cinn/system.cmake create mode 100644 cmake/cinn/version.cmake delete mode 100644 cmake/external/cinn.cmake create mode 100644 python/setup_cinn.py.in diff --git a/CMakeLists.txt b/CMakeLists.txt index 3a75d8c35552d..9d354485811f0 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -63,6 +63,16 @@ option(WITH_ONNXRUNTIME "Compile PaddlePaddle with ONNXRUNTIME" OFF) option(WITH_CUSPARSELT "Compile PaddlePaddle with CUSPARSELT" OFF) option(WITH_SETUP_INSTALL "Compile PaddlePaddle with setup.py" OFF) option(WITH_SHARED_PHI "Compile PaddlePaddle with SHARED LIB of PHI" OFF) +option(CINN_ONLY "Compile CINN only in Paddle" OFF) + +find_package(Git REQUIRED) + +# config GIT_URL with github mirrors to speed up dependent repos clone +option(GIT_URL "Git URL to clone dependent repos" ${GIT_URL}) +if(NOT GIT_URL) + set(GIT_URL "https://github.com") +endif() + # Note(zhouwei): It use option above, so put here include(init) include(generic) # simplify cmake module @@ -112,7 +122,7 @@ endif() if(WIN32) option(MSVC_STATIC_CRT "use static C Runtime library by default" ON) - message("Build static library of PHI") + set(CMAKE_SUPPRESS_REGENERATION ON) set(CMAKE_STATIC_LIBRARY_PREFIX lib) @@ -229,13 +239,6 @@ else() ) endif() -find_package(Git REQUIRED) - -# config GIT_URL with github mirrors to speed up dependent repos clone -option(GIT_URL "Git URL to clone dependent repos" ${GIT_URL}) -if(NOT GIT_URL) - set(GIT_URL "https://github.com") -endif() find_package(Threads REQUIRED) @@ -569,6 +572,28 @@ include(third_party include(flags) # set paddle compile flags +#------------- cinn cmake config start -------------- + +if(WITH_CINN) + message(STATUS "Compile Paddle with CINN.") + include(cmake/cinn.cmake) + add_definitions(-DPADDLE_WITH_CINN) + if(WITH_GPU) + add_definitions(-DCINN_WITH_CUDA) + 
add_definitions(-DCINN_WITH_CUDNN) + endif() + + if(CINN_ONLY) + if(WITH_PYTHON) + add_subdirectory(python) + endif() + add_subdirectory(test) + return() + endif() +endif() + +#------------- cinn cmake config end -------------- + if(WITH_PROFILER) find_package(Gperftools REQUIRED) include_directories(${GPERFTOOLS_INCLUDE_DIR}) diff --git a/cmake/cinn.cmake b/cmake/cinn.cmake new file mode 100644 index 0000000000000..74fdf7c4ae358 --- /dev/null +++ b/cmake/cinn.cmake @@ -0,0 +1,299 @@ +set(CMAKE_SKIP_INSTALL_ALL_DEPENDENCY TRUE) + +set(CINN_THIRD_PARTY_PATH "${CMAKE_BINARY_DIR}/third_party") +set(CMAKE_EXPORT_COMPILE_COMMANDS ON) +set(DOWNLOAD_MODEL_DIR "${CINN_THIRD_PARTY_PATH}/model") + +if(NOT DEFINED ENV{runtime_include_dir}) + message( + STATUS + "set runtime_include_dir: ${CMAKE_SOURCE_DIR}/paddle/cinn/runtime/cuda") + set(ENV{runtime_include_dir} "${CMAKE_SOURCE_DIR}/paddle/cinn/runtime/cuda") + add_definitions( + -DRUNTIME_INCLUDE_DIR="${CMAKE_SOURCE_DIR}/paddle/cinn/runtime/cuda") +endif() + +if(WITH_TESTING) + add_definitions(-DCINN_WITH_TEST) +endif() +if(WITH_DEBUG) + add_definitions(-DCINN_WITH_DEBUG) +endif() + + +# TODO(zhhsplendid): CINN has lots of warnings during early development. +# They will be treated as errors under paddle. We set no-error now and we will +# clean the code in the future. +add_definitions(-w) + +include(cmake/cinn/version.cmake) +# include the customized configures +if(EXISTS ${CMAKE_BINARY_DIR}/config.cmake) + include(${CMAKE_BINARY_DIR}/config.cmake) +endif() + +if(WITH_GPU) + message(STATUS "Enable CINN CUDA") + add_definitions(-DCINN_WITH_CUDA) + message(STATUS "Enable CINN CUDNN") + add_definitions(-DCINN_WITH_CUDNN) + enable_language(CUDA) + find_package(CUDA REQUIRED) + include_directories(${CUDA_INCLUDE_DIRS}) + include_directories(${CMAKE_SOURCE_DIR}/paddle/cinn/runtime/cuda) + include_directories(/usr/lib/x86_64-linux-gnu) + set(CUDA_SEPARABLE_COMPILATION ON) + + cuda_select_nvcc_arch_flags(ARCH_FLAGS Auto) + list(APPEND CUDA_NVCC_FLAGS ${ARCH_FLAGS}) + + message( + STATUS + "copy paddle/cinn/common/float16.h paddle/cinn/common/bfloat16.h to $ENV{runtime_include_dir}" + ) + file(COPY paddle/cinn/common/float16.h paddle/cinn/common/bfloat16.h + DESTINATION $ENV{runtime_include_dir}) + + find_library(CUDASTUB libcuda.so HINTS ${CUDA_TOOLKIT_ROOT_DIR}/lib64/stubs/ + REQUIRED) + find_library(CUBLAS libcublas.so HINTS ${CUDA_TOOLKIT_ROOT_DIR}/lib64 + /usr/lib REQUIRED) + find_library(CUDNN libcudnn.so HINTS ${CUDA_TOOLKIT_ROOT_DIR}/lib64 /usr/lib + REQUIRED) + find_library(CURAND libcurand.so HINTS ${CUDA_TOOLKIT_ROOT_DIR}/lib64 + /usr/lib REQUIRED) + find_library(CUSOLVER libcusolver.so HINTS ${CUDA_TOOLKIT_ROOT_DIR}/lib64 + /usr/lib REQUIRED) +endif() + +set(cinnapi_src CACHE INTERNAL "" FORCE) +set(core_src CACHE INTERNAL "" FORCE) +set(core_includes CACHE INTERNAL "" FORCE) +set(core_proto_includes CACHE INTERNAL "" FORCE) + +include_directories(${CMAKE_SOURCE_DIR}) +include_directories(${CMAKE_BINARY_DIR}) + +include(cmake/generic.cmake) +include(cmake/cinn/system.cmake) +include(cmake/cinn/core.cmake) +include(cmake/cinn/external/absl.cmake) +include(cmake/cinn/nvrtc.cmake) +include(cmake/cinn/nvtx.cmake) +include(cmake/cinn/external/llvm.cmake) +include(cmake/cinn/external/isl.cmake) +include(cmake/cinn/external/ginac.cmake) +include(cmake/cinn/external/openmp.cmake) +include(cmake/cinn/external/jitify.cmake) + + +set(LINK_FLAGS + "-Wl,--version-script ${CMAKE_CURRENT_SOURCE_DIR}/cmake/cinn/export.map" + CACHE INTERNAL "") 
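+# Note: cmake/cinn/export.map (shown later in this patch) is a linker version
+# script: it keeps only `RegisterKernels` in the shared library's dynamic
+# symbol table and makes every other symbol local. A minimal sketch of how
+# such a cached flag is typically attached to a target (illustrative only;
+# the target name is hypothetical and the real consumer of LINK_FLAGS lives
+# outside this hunk):
+#   set_target_properties(some_shared_lib PROPERTIES LINK_FLAGS "${LINK_FLAGS}")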
+set(global_test_args + "--cinn_x86_builtin_code_root=${CMAKE_SOURCE_DIR}/paddle/cinn/backends") + +set(Python_VIRTUALENV FIRST) + +if(NOT PYTHON_EXECUTABLE) + find_package(PythonInterp ${PY_VERSION} REQUIRED) +endif() + +if(NOT PYTHON_LIBRARIES) + find_package(PythonLibs ${PY_VERSION} REQUIRED) +endif() + +message(STATUS "PYTHON_LIBRARIES: ${PYTHON_LIBRARIES}") +message(STATUS "PYTHON_INCLUDE_DIR: ${PYTHON_INCLUDE_DIR}") + +include_directories(${PYTHON_INCLUDE_DIR}) + +set(core_deps CACHE INTERNAL "" FORCE) +set(hlir_src CACHE INTERNAL "" FORCE) + +# TODO(chenweihang): The logic later depends adding cinn subdirectory here, +# but better to move to paddle/CMakeLists.txt +add_subdirectory(paddle/cinn) + +set(core_src "${cinnapi_src}") + +cinn_cc_library( + cinnapi + SHARED + SRCS + ${cinnapi_src} + DEPS + glog + ${llvm_libs} + cinn_framework_proto + param_proto + auto_schedule_proto + schedule_desc_proto + absl + isl + ginac + pybind + ${jitify_deps}) +add_dependencies(cinnapi GEN_LLVM_RUNTIME_IR_HEADER ZLIB::ZLIB) +add_dependencies(cinnapi GEN_LLVM_RUNTIME_IR_HEADER ${core_deps}) + +target_link_libraries(cinnapi ${PYTHON_LIBRARIES}) + +if(WITH_MKL) + target_link_libraries(cinnapi cinn_mklml) + add_dependencies(cinnapi cinn_mklml) + if(WITH_MKLDNN) + target_link_libraries(cinnapi mkldnn) + add_dependencies(cinnapi mkldnn) + endif() +endif() + +if(WITH_GPU) + target_link_libraries( + cinnapi + ${CUDA_NVRTC_LIB} + ${CUDA_LIBRARIES} + ${CUDASTUB} + ${CUBLAS} + ${CUDNN} + ${CURAND} + ${CUSOLVER}) + if(NVTX_FOUND) + target_link_libraries(cinnapi ${CUDA_NVTX_LIB}) + endif() +endif() + +function(gen_cinncore LINKTYPE) + set(CINNCORE_TARGET cinncore) + if(${LINKTYPE} STREQUAL "STATIC") + set(CINNCORE_TARGET cinncore_static) + endif() + cinn_cc_library( + ${CINNCORE_TARGET} + ${LINKTYPE} + SRCS + ${core_src} + DEPS + glog + ${llvm_libs} + cinn_framework_proto + param_proto + auto_schedule_proto + schedule_desc_proto + absl + isl + ginac) + add_dependencies(${CINNCORE_TARGET} GEN_LLVM_RUNTIME_IR_HEADER ZLIB::ZLIB) + add_dependencies(${CINNCORE_TARGET} GEN_LLVM_RUNTIME_IR_HEADER ${core_deps}) + + add_dependencies(${CINNCORE_TARGET} pybind) + target_link_libraries(${CINNCORE_TARGET} ${PYTHON_LIBRARIES}) + + if(WITH_MKL) + target_link_libraries(${CINNCORE_TARGET} cinn_mklml) + add_dependencies(${CINNCORE_TARGET} cinn_mklml) + if(WITH_MKLDNN) + target_link_libraries(${CINNCORE_TARGET} mkldnn) + add_dependencies(${CINNCORE_TARGET} mkldnn) + endif() + endif() + + if(WITH_GPU) + target_link_libraries( + ${CINNCORE_TARGET} + ${CUDA_NVRTC_LIB} + ${CUDA_LIBRARIES} + ${CUDASTUB} + ${CUBLAS} + ${CUDNN} + ${CURAND} + ${CUSOLVER} + ${jitify_deps}) + if(NVTX_FOUND) + target_link_libraries(${CINNCORE_TARGET} ${CUDA_NVTX_LIB}) + endif() + endif() +endfunction() + +gen_cinncore(STATIC) +gen_cinncore(SHARED) + +# --------distribute cinncore lib and include begin-------- +set(PUBLISH_LIBS ON) +if(PUBLISH_LIBS) + set(core_includes + "${core_includes};paddle/cinn/runtime/cuda/cinn_cuda_runtime_source.cuh") + foreach(header ${core_includes}) + get_filename_component(prefix ${header} DIRECTORY) + file(COPY ${header} + DESTINATION ${CMAKE_BINARY_DIR}/dist/cinn/include/${prefix}) + endforeach() + + foreach(proto_header ${core_proto_includes}) + string(REPLACE ${CMAKE_BINARY_DIR}/ "" relname ${proto_header}) + get_filename_component(prefix ${relname} DIRECTORY) + set(target_name ${CMAKE_BINARY_DIR}/dist/cinn/include/${relname}) + add_custom_command( + TARGET cinnapi + POST_BUILD + COMMENT "copy generated proto header 
'${relname}' to dist" + COMMAND cmake -E copy ${proto_header} ${target_name} DEPENDS cinnapi) + endforeach() + + add_custom_command( + TARGET cinnapi + POST_BUILD + COMMAND cmake -E copy ${CMAKE_BINARY_DIR}/libcinnapi.so + ${CMAKE_BINARY_DIR}/dist/cinn/lib/libcinnapi.so + COMMAND cmake -E copy_directory ${CINN_THIRD_PARTY_PATH}/install + ${CMAKE_BINARY_DIR}/dist/third_party DEPENDS cinnapi) + add_custom_command( + TARGET cinncore_static + POST_BUILD + COMMAND cmake -E copy ${CMAKE_BINARY_DIR}/libcinncore_static.a + ${CMAKE_BINARY_DIR}/dist/cinn/lib/libcinncore_static.a + COMMAND + cmake -E copy + ${CMAKE_BINARY_DIR}/paddle/cinn/frontend/paddle/libcinn_framework_proto.a + ${CMAKE_BINARY_DIR}/dist/cinn/lib/libcinn_framework_proto.a + COMMAND + cmake -E copy ${CMAKE_BINARY_DIR}/paddle/cinn/hlir/pe/libparam_proto.a + ${CMAKE_BINARY_DIR}/dist/cinn/lib/libparam_proto.a + COMMAND + cmake -E copy + ${CMAKE_BINARY_DIR}/paddle/cinn/auto_schedule/libauto_schedule_proto.a + ${CMAKE_BINARY_DIR}/dist/cinn/lib/libauto_schedule_proto.a + COMMAND + cmake -E copy ${CMAKE_BINARY_DIR}/paddle/cinn/ir/libschedule_desc_proto.a + ${CMAKE_BINARY_DIR}/dist/cinn/lib/libschedule_desc_proto.a + COMMENT "distribute libcinncore_static.a and related header files." DEPENDS + cinncore_static) +endif() +# --------distribute cinncore lib and include end-------- + +set(CINN_LIB_NAME "libcinnapi.so") +set(CINN_LIB_LOCATION "${CMAKE_BINARY_DIR}/dist/cinn/lib") +set(CINN_LIB "${CINN_LIB_LOCATION}/${CINN_LIB_NAME}") + +###################################### +# Add CINN's dependencies header files +###################################### + +# Add absl +set(ABSL_INCLUDE_DIR "${CMAKE_BINARY_DIR}/dist/third_party/absl/include") +include_directories(${ABSL_INCLUDE_DIR}) + +# Add isl +set(ISL_INCLUDE_DIR "${CMAKE_BINARY_DIR}/dist/third_party/isl/include") +include_directories(${ISL_INCLUDE_DIR}) + +# Add LLVM +set(LLVM_INCLUDE_DIR "${CMAKE_BINARY_DIR}/dist/third_party/llvm/include") +include_directories(${LLVM_INCLUDE_DIR}) + +###################################################### +# Put external_cinn and dependencies together as a lib +###################################################### + +set(CINN_INCLUDE_DIR "${CMAKE_BINARY_DIR}/dist/cinn/include") +include_directories(${CINN_INCLUDE_DIR}) diff --git a/cmake/cinn/config.cmake b/cmake/cinn/config.cmake new file mode 100755 index 0000000000000..4a390539fabef --- /dev/null +++ b/cmake/cinn/config.cmake @@ -0,0 +1,11 @@ +# The home path of ISL +# Required! +set(ISL_HOME "") + +# Whether enable NVidia CUDA support. 
+# Possible values: ON, OFF +set(WITH_GPU ON) + +set(WITH_MKL ON) +set(WITH_MKLDNN ON) +set(USE_OPENMP "intel") diff --git a/cmake/cinn/core.cmake b/cmake/cinn/core.cmake new file mode 100644 index 0000000000000..91809b697aeec --- /dev/null +++ b/cmake/cinn/core.cmake @@ -0,0 +1,459 @@ +set(CMAKE_CXX_FLAGS + "${CMAKE_CXX_FLAGS} -fPIC -mavx -mfma -Wno-write-strings -Wno-psabi") + +set(PADDLE_RESOURCE_URL + "http://paddle-inference-dist.bj.bcebos.com" + CACHE STRING "inference download url") + +function(cinn_cc_library TARGET_NAME) + set(options STATIC static SHARED shared) + set(oneValueArgs "") + set(multiValueArgs SRCS DEPS) + cmake_parse_arguments(cinn_cc_library "${options}" "${oneValueArgs}" + "${multiValueArgs}" ${ARGN}) + if(cinn_cc_library_SRCS) + if(cinn_cc_library_SHARED OR cinn_cc_library_shared) # build *.so + add_library(${TARGET_NAME} SHARED ${cinn_cc_library_SRCS}) + else() + add_library(${TARGET_NAME} STATIC ${cinn_cc_library_SRCS}) + endif() + + if(cinn_cc_library_DEPS) + # Don't need link libwarpctc.so + target_link_libraries(${TARGET_NAME} ${cinn_cc_library_DEPS}) + add_dependencies(${TARGET_NAME} ${cinn_cc_library_DEPS}) + endif() + + # cpplint code style + foreach(source_file ${cinn_cc_library_SRCS}) + string(REGEX REPLACE "\\.[^.]*$" "" source ${source_file}) + if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${source}.h) + list(APPEND cinn_cc_library_HEADERS + ${CMAKE_CURRENT_SOURCE_DIR}/${source}.h) + endif() + endforeach() + else(cinn_cc_library_SRCS) + if(cinn_cc_library_DEPS) + cinn_merge_static_libs(${TARGET_NAME} ${cinn_cc_library_DEPS}) + else() + message( + FATAL_ERROR + "Please specify source files or libraries in cinn_cc_library(${TARGET_NAME} ...)." + ) + endif() + endif(cinn_cc_library_SRCS) + + if((NOT ("${TARGET_NAME}" STREQUAL "cinn_gtest_main")) + AND (NOT ("${TARGET_NAME}" STREQUAL "utils")) + AND (NOT ("${TARGET_NAME}" STREQUAL "lib"))) + target_link_libraries(${TARGET_NAME} Threads::Threads) + + endif( + (NOT ("${TARGET_NAME}" STREQUAL "cinn_gtest_main")) + AND (NOT ("${TARGET_NAME}" STREQUAL "utils")) + AND (NOT ("${TARGET_NAME}" STREQUAL "lib"))) +endfunction(cinn_cc_library) + +list(APPEND CMAKE_CTEST_ARGUMENTS) + +function(remove_gflags TARGET_NAME) + get_target_property(TARGET_LIBRARIES ${TARGET_NAME} LINK_LIBRARIES) + list(REMOVE_ITEM TARGET_LIBRARIES glog) + list(REMOVE_ITEM TARGET_LIBRARIES gflags) + set_property(TARGET ${TARGET_NAME} PROPERTY LINK_LIBRARIES + ${TARGET_LIBRARIES}) +endfunction() + +function(cinn_cc_test TARGET_NAME) + if(WITH_TESTING) + set(options SERIAL) + set(oneValueArgs "") + set(multiValueArgs SRCS DEPS ARGS) + cmake_parse_arguments(cinn_cc_test "${options}" "${oneValueArgs}" + "${multiValueArgs}" ${ARGN}) + add_executable(${TARGET_NAME} ${cinn_cc_test_SRCS}) + get_property(os_dependency_modules GLOBAL PROPERTY OS_DEPENDENCY_MODULES) + target_link_libraries(${TARGET_NAME} ${os_dependency_modules} + cinn_gtest_main gtest glog ${cinn_cc_test_DEPS}) + add_dependencies(${TARGET_NAME} cinn_gtest_main gtest glog + ${cinn_cc_test_DEPS}) + + add_test( + NAME ${TARGET_NAME} + COMMAND ${TARGET_NAME} "${cinn_cc_test_ARGS}" + WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}) + if(${cinn_cc_test_SERIAL}) + set_property(TEST ${TARGET_NAME} PROPERTY RUN_SERIAL 1) + endif() + # No unit test should exceed 10 minutes. 
+    set_tests_properties(${TARGET_NAME} PROPERTIES TIMEOUT 6000)
+    remove_gflags(${TARGET_NAME})
+  endif()
+endfunction()
+
+function(cinn_nv_library TARGET_NAME)
+  if(WITH_GPU)
+    set(options STATIC static SHARED shared)
+    set(oneValueArgs "")
+    set(multiValueArgs SRCS DEPS)
+    cmake_parse_arguments(cinn_nv_library "${options}" "${oneValueArgs}"
+                          "${multiValueArgs}" ${ARGN})
+    if(cinn_nv_library_SRCS)
+      if(cinn_nv_library_SHARED OR cinn_nv_library_shared) # build *.so
+        cuda_add_library(${TARGET_NAME} SHARED ${cinn_nv_library_SRCS})
+      else()
+        cuda_add_library(${TARGET_NAME} STATIC ${cinn_nv_library_SRCS})
+      endif()
+      if(cinn_nv_library_DEPS)
+        add_dependencies(${TARGET_NAME} ${cinn_nv_library_DEPS})
+        target_link_libraries(${TARGET_NAME} ${cinn_nv_library_DEPS})
+      endif()
+      # cpplint code style
+      foreach(source_file ${cinn_nv_library_SRCS})
+        string(REGEX REPLACE "\\.[^.]*$" "" source ${source_file})
+        if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${source}.h)
+          list(APPEND cinn_nv_library_HEADERS
+               ${CMAKE_CURRENT_SOURCE_DIR}/${source}.h)
+        endif()
+      endforeach()
+    else(cinn_nv_library_SRCS)
+      if(cinn_nv_library_DEPS)
+        cinn_merge_static_libs(${TARGET_NAME} ${cinn_nv_library_DEPS})
+      else()
+        message(
+          FATAL_ERROR
+            "Please specify source files or libraries in cinn_nv_library.")
+      endif()
+    endif(cinn_nv_library_SRCS)
+    target_link_libraries(${TARGET_NAME} Threads::Threads)
+  endif()
+endfunction(cinn_nv_library)
+
+function(cinn_nv_binary TARGET_NAME)
+  if(WITH_GPU)
+    set(options "")
+    set(oneValueArgs "")
+    set(multiValueArgs SRCS DEPS)
+    cmake_parse_arguments(cinn_nv_binary "${options}" "${oneValueArgs}"
+                          "${multiValueArgs}" ${ARGN})
+    cuda_add_executable(${TARGET_NAME} ${cinn_nv_binary_SRCS})
+    if(cinn_nv_binary_DEPS)
+      target_link_libraries(${TARGET_NAME} ${cinn_nv_binary_DEPS})
+      add_dependencies(${TARGET_NAME} ${cinn_nv_binary_DEPS})
+      common_link(${TARGET_NAME})
+    endif()
+  endif()
+endfunction(cinn_nv_binary)
+
+function(cinn_nv_test TARGET_NAME)
+  if(WITH_GPU AND WITH_TESTING)
+    set(options SERIAL)
+    set(oneValueArgs "")
+    set(multiValueArgs SRCS DEPS ARGS)
+    cmake_parse_arguments(cinn_nv_test "${options}" "${oneValueArgs}"
+                          "${multiValueArgs}" ${ARGN})
+    cuda_add_executable(${TARGET_NAME} ${cinn_nv_test_SRCS})
+    get_property(os_dependency_modules GLOBAL PROPERTY OS_DEPENDENCY_MODULES)
+    target_link_libraries(
+      ${TARGET_NAME}
+      ${cinn_nv_test_DEPS}
+      cinn_gtest_main
+      gtest
+      ${os_dependency_modules}
+      ${CUDNN_LIBRARY}
+      ${CUBLAS_LIBRARIES}
+      ${CUDA_LIBRARIES})
+    add_dependencies(${TARGET_NAME} ${cinn_nv_test_DEPS} cinn_gtest_main gtest)
+    common_link(${TARGET_NAME})
+    add_test(
+      NAME ${TARGET_NAME}
+      COMMAND ${TARGET_NAME} "${cinn_nv_test_ARGS}"
+      WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
+    if(cinn_nv_test_SERIAL)
+      set_property(TEST ${TARGET_NAME} PROPERTY RUN_SERIAL 1)
+    endif()
+    target_link_libraries(
+      ${TARGET_NAME} Threads::Threads ${CUDA_NVRTC_LIB} ${CUDA_LIBRARIES}
+      ${CUDA_cudart_static_LIBRARY}
+      ${CUDA_TOOLKIT_ROOT_DIR}/lib64/stubs/libcuda.so)
+    if(NVTX_FOUND)
+      target_link_libraries(${TARGET_NAME} ${CUDA_NVTX_LIB})
+    endif()
+    remove_gflags(${TARGET_NAME})
+  endif()
+endfunction(cinn_nv_test)
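+
+# A hypothetical usage sketch of the helpers above (target, source, and
+# dependency names are illustrative, not taken from this patch):
+#   cinn_cc_library(my_ir_util SRCS my_ir_util.cc DEPS absl isl)
+#   cinn_cc_test(my_ir_util_test SRCS my_ir_util_test.cc DEPS my_ir_util)
+#   cinn_nv_test(my_kernel_test SRCS my_kernel_test.cu DEPS my_ir_util SERIAL)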
+
+# Add a dependency so that TARGET depends on the test result of DEP; this
+# function executes DEP during make.
+function(add_run_test_dependency TARGET_NAME DEP_NAME)
+  if(WITH_TESTING)
+    set(custom_target_name ${TARGET_NAME}_TEST_OUTPUT_DEPENDENCY_ON_${DEP_NAME})
+    add_custom_target(
+      ${custom_target_name}
+      COMMAND
+        cd ${CMAKE_CURRENT_BINARY_DIR} && ./${DEP_NAME}
+        --cinn_x86_builtin_code_root=${CMAKE_SOURCE_DIR}/paddle/cinn/backends
+      COMMAND cd ${CMAKE_BINARY_DIR}
+      DEPENDS ${DEP_NAME})
+    add_dependencies(${TARGET_NAME} ${DEP_NAME} ${custom_target_name})
+  endif(WITH_TESTING)
+endfunction(add_run_test_dependency)
+
+# Find all third_party modules used by the paddle static library, to reduce
+# the dependencies when building the inference libs.
+set_property(GLOBAL PROPERTY FLUID_THIRD_PARTY)
+function(find_fluid_thirdparties TARGET_NAME)
+  get_filename_component(__target_path ${TARGET_NAME} ABSOLUTE)
+  string(REGEX REPLACE "^${PADDLE_SOURCE_DIR}/" "" __target_path
+                       ${__target_path})
+  string(FIND "${__target_path}" "third_party" pos)
+  if(pos GREATER 1)
+    get_property(fluid_third_partys GLOBAL PROPERTY FLUID_THIRD_PARTY)
+    set(fluid_third_partys ${fluid_third_partys} ${TARGET_NAME})
+    set_property(GLOBAL PROPERTY FLUID_THIRD_PARTY "${fluid_third_partys}")
+  endif()
+endfunction(find_fluid_thirdparties)
+
+function(cinn_merge_static_libs TARGET_NAME)
+  set(libs ${ARGN})
+  list(REMOVE_DUPLICATES libs)
+
+  # Get all propagation dependencies from the merged libraries
+  foreach(lib ${libs})
+    list(APPEND libs_deps ${${lib}_LIB_DEPENDS})
+  endforeach()
+  if(libs_deps)
+    list(REMOVE_DUPLICATES libs_deps)
+  endif()
+
+  # To produce a library we need at least one source file. It is created by
+  # the add_custom_command below and also helps to track dependencies.
+  set(target_SRCS ${CMAKE_CURRENT_BINARY_DIR}/${TARGET_NAME}_dummy.c)
+
+  if(APPLE) # Use OSX's libtool to merge archives
+    # Make the generated dummy source file depend on all static input libs.
+    # If an input lib changes, the source file is touched, which causes the
+    # desired effect (relink).
+    add_custom_command(
+      OUTPUT ${target_SRCS}
+      COMMAND ${CMAKE_COMMAND} -E touch ${target_SRCS}
+      DEPENDS ${libs})
+
+    # Generate dummy static lib
+    file(WRITE ${target_SRCS}
+         "const char *dummy_${TARGET_NAME} = \"${target_SRCS}\";")
+    add_library(${TARGET_NAME} STATIC ${target_SRCS})
+    target_link_libraries(${TARGET_NAME} ${libs_deps})
+
+    foreach(lib ${libs})
+      # Get the file names of the libraries to be merged
+      set(libfiles ${libfiles} $<TARGET_FILE:${lib}>)
+    endforeach()
+    add_custom_command(
+      TARGET ${TARGET_NAME}
+      POST_BUILD
+      COMMAND rm "${CMAKE_CURRENT_BINARY_DIR}/lib${TARGET_NAME}.a"
+      COMMAND /usr/bin/libtool -static -o
+              "${CMAKE_CURRENT_BINARY_DIR}/lib${TARGET_NAME}.a" ${libfiles})
+  endif(APPLE)
+  # general UNIX: use "ar" to extract objects and re-add to a common lib
+  if(LINUX)
+    set(target_DIR ${CMAKE_CURRENT_BINARY_DIR}/${TARGET_NAME}.dir)
+
+    foreach(lib ${libs})
+      # list of objects in the input library
+      set(objlistfile ${target_DIR}/${lib}.objlist)
+      set(objdir ${target_DIR}/${lib}.objdir)
+
+      add_custom_command(
+        OUTPUT ${objdir}
+        COMMAND ${CMAKE_COMMAND} -E make_directory ${objdir}
+        DEPENDS ${lib})
+
+      add_custom_command(
+        OUTPUT ${objlistfile}
+        COMMAND ${CMAKE_AR} -x "$<TARGET_FILE:${lib}>"
+        COMMAND ${CMAKE_AR} -t "$<TARGET_FILE:${lib}>" > ${objlistfile}
+        DEPENDS ${lib} ${objdir}
+        WORKING_DIRECTORY ${objdir})
+
+      list(APPEND target_OBJS "${objlistfile}")
+    endforeach()
+
+    # Make the generated dummy source file depend on all static input libs.
+    # If an input lib changes, the source file is touched, which causes the
+    # desired effect (relink).
+    add_custom_command(
+      OUTPUT ${target_SRCS}
+      COMMAND ${CMAKE_COMMAND} -E touch ${target_SRCS}
+      DEPENDS ${libs} ${target_OBJS})
+
+    # Generate dummy static lib
+    file(WRITE ${target_SRCS}
+         "const char *dummy_${TARGET_NAME} = \"${target_SRCS}\";")
+    add_library(${TARGET_NAME} STATIC ${target_SRCS})
+    target_link_libraries(${TARGET_NAME} ${libs_deps})
+
+    # Get the file name of the generated library
+    set(target_LIBNAME "$<TARGET_FILE:${TARGET_NAME}>")
+
+    add_custom_command(
+      TARGET ${TARGET_NAME}
+      POST_BUILD
+      COMMAND ${CMAKE_AR} crs ${target_LIBNAME} `find ${target_DIR} -name '*.o'`
+      COMMAND ${CMAKE_RANLIB} ${target_LIBNAME}
+      WORKING_DIRECTORY ${target_DIR})
+  endif(LINUX)
+  if(WIN32)
+    # Windows does not support gcc/nvcc combined compiling, so use msvc's
+    # lib.exe to merge the libs. Make the generated dummy source file depend
+    # on all static input libs. If an input lib changes, the source file is
+    # touched, which causes the desired effect (relink).
+    add_custom_command(
+      OUTPUT ${target_SRCS}
+      COMMAND ${CMAKE_COMMAND} -E touch ${target_SRCS}
+      DEPENDS ${libs})
+
+    # Generate dummy static lib
+    file(WRITE ${target_SRCS}
+         "const char *dummy_${TARGET_NAME} = \"${target_SRCS}\";")
+    add_library(${TARGET_NAME} STATIC ${target_SRCS})
+    target_link_libraries(${TARGET_NAME} ${libs_deps})
+
+    foreach(lib ${libs})
+      # Get the file names of the libraries to be merged
+      set(libfiles ${libfiles} $<TARGET_FILE:${lib}>)
+    endforeach()
+    # msvc will put the library in the "/Release/xxxlib" directory by default
+    # COMMAND cmake -E remove "${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_BUILD_TYPE}/${TARGET_NAME}.lib"
+    add_custom_command(
+      TARGET ${TARGET_NAME}
+      POST_BUILD
+      COMMAND cmake -E make_directory
+              "${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_BUILD_TYPE}"
+      COMMAND
+        lib
+        /OUT:${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_BUILD_TYPE}/lib${TARGET_NAME}.lib
+        ${libfiles})
+  endif(WIN32)
+endfunction(cinn_merge_static_libs)
+
+# Modification of the standard 'protobuf_generate_cpp()' with protobuf-lite
+# support. Usage:
+#   paddle_protobuf_generate_cpp(<srcs_var> <hdrs_var> <proto files ...>)
+function(paddle_protobuf_generate_cpp SRCS HDRS)
+  if(NOT ARGN)
+    message(
+      SEND_ERROR
+        "Error: paddle_protobuf_generate_cpp() called without any proto files")
+    return()
+  endif()
+
+  set(${SRCS})
+  set(${HDRS})
+
+  foreach(FIL ${ARGN})
+    get_filename_component(ABS_FIL ${FIL} ABSOLUTE)
+    get_filename_component(FIL_WE ${FIL} NAME_WE)
+
+    set(_protobuf_protoc_src "${CMAKE_CURRENT_BINARY_DIR}/${FIL_WE}.pb.cc")
+    set(_protobuf_protoc_hdr "${CMAKE_CURRENT_BINARY_DIR}/${FIL_WE}.pb.h")
+    list(APPEND ${SRCS} "${_protobuf_protoc_src}")
+    list(APPEND ${HDRS} "${_protobuf_protoc_hdr}")
+
+    add_custom_command(
+      OUTPUT "${_protobuf_protoc_src}" "${_protobuf_protoc_hdr}"
+      COMMAND ${CMAKE_COMMAND} -E make_directory "${CMAKE_CURRENT_BINARY_DIR}"
+      COMMAND ${PROTOBUF_PROTOC_EXECUTABLE} -I${CMAKE_SOURCE_DIR} --cpp_out
+              "${CMAKE_BINARY_DIR}" ${ABS_FIL}
+      DEPENDS ${ABS_FIL} protoc
+      COMMENT "Running C++ protocol buffer compiler on ${FIL}"
+      VERBATIM)
+  endforeach()
+
+  set_source_files_properties(${${SRCS}} ${${HDRS}} PROPERTIES GENERATED TRUE)
+  set(${SRCS}
+      ${${SRCS}}
+      PARENT_SCOPE)
+  set(${HDRS}
+      ${${HDRS}}
+      PARENT_SCOPE)
+endfunction()
+
+function(cinn_proto_library TARGET_NAME)
+  set(oneValueArgs "")
+  set(multiValueArgs SRCS DEPS)
+  cmake_parse_arguments(cinn_proto_library "${options}" "${oneValueArgs}"
+                        "${multiValueArgs}" ${ARGN})
+  set(proto_srcs)
+  set(proto_hdrs)
+  paddle_protobuf_generate_cpp(proto_srcs proto_hdrs ${cinn_proto_library_SRCS})
+  cinn_cc_library(${TARGET_NAME} SRCS ${proto_srcs} DEPS
${cinn_proto_library_DEPS} protobuf) + set("${TARGET_NAME}_HDRS" + ${proto_hdrs} + PARENT_SCOPE) + set("${TARGET_NAME}_SRCS" + ${proto_srcs} + PARENT_SCOPE) +endfunction() + +function(common_link TARGET_NAME) + if(WITH_PROFILER) + target_link_libraries(${TARGET_NAME} gperftools::profiler) + endif() + + if(WITH_JEMALLOC) + target_link_libraries(${TARGET_NAME} jemalloc::jemalloc) + endif() +endfunction() + +# This method is borrowed from Paddle-Lite. +function(download_and_uncompress INSTALL_DIR URL FILENAME) + message(STATUS "Download inference test stuff from ${URL}/${FILENAME}") + string(REGEX REPLACE "[-%.]" "_" FILENAME_EX ${FILENAME}) + set(EXTERNAL_PROJECT_NAME "extern_lite_download_${FILENAME_EX}") + set(UNPACK_DIR "${INSTALL_DIR}/src/${EXTERNAL_PROJECT_NAME}") + ExternalProject_Add( + ${EXTERNAL_PROJECT_NAME} + ${EXTERNAL_PROJECT_LOG_ARGS} + PREFIX ${INSTALL_DIR} + DOWNLOAD_COMMAND + wget --no-check-certificate -q -O ${INSTALL_DIR}/${FILENAME} + ${URL}/${FILENAME} && ${CMAKE_COMMAND} -E tar xzf + ${INSTALL_DIR}/${FILENAME} + DOWNLOAD_DIR ${INSTALL_DIR} + DOWNLOAD_NO_PROGRESS 1 + CONFIGURE_COMMAND "" + BUILD_COMMAND "" + UPDATE_COMMAND "" + INSTALL_COMMAND "") +endfunction() + +function(gather_srcs SRC_GROUP) + set(options) + set(oneValueArgs) + set(multiValueArgs "SRCS") + cmake_parse_arguments(prefix "" "" "${multiValueArgs}" ${ARGN}) + foreach(cpp ${prefix_SRCS}) + set(${SRC_GROUP} + "${${SRC_GROUP}};${CMAKE_CURRENT_SOURCE_DIR}/${cpp}" + CACHE INTERNAL "") + endforeach() +endfunction() + +function(core_gather_headers) + file( + GLOB includes + LIST_DIRECTORIES false + RELATIVE ${CMAKE_SOURCE_DIR} + *.h) + + foreach(header ${includes}) + set(core_includes + "${core_includes};${header}" + CACHE INTERNAL "") + endforeach() +endfunction() diff --git a/cmake/cinn/export.map b/cmake/cinn/export.map new file mode 100644 index 0000000000000..0b1aff5de9c00 --- /dev/null +++ b/cmake/cinn/export.map @@ -0,0 +1,6 @@ +{ + global: + RegisterKernels; + local: + *; +}; diff --git a/cmake/cinn/external/absl.cmake b/cmake/cinn/external/absl.cmake new file mode 100644 index 0000000000000..93c70c54959d4 --- /dev/null +++ b/cmake/cinn/external/absl.cmake @@ -0,0 +1,78 @@ +include(ExternalProject) + +set(ABSL_SOURCES_DIR ${CINN_THIRD_PARTY_PATH}/absl) +set(ABSL_INSTALL_DIR ${CINN_THIRD_PARTY_PATH}/install/absl) + +set(ABSL_CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS}) + +set(ABSL_REPOSITORY "https://github.com/abseil/abseil-cpp.git") +set(ABSL_TAG "20210324.2") + +set(OPTIONAL_ARGS + "-DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}" + "-DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}" + "-DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}" + "-DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE}" + "-DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG}" + "-DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}" + "-DCMAKE_C_FLAGS_DEBUG=${CMAKE_C_FLAGS_DEBUG}" + "-DCMAKE_C_FLAGS_RELEASE=${CMAKE_C_FLAGS_RELEASE}") + +ExternalProject_Add( + external_absl + ${EXTERNAL_PROJECT_LOG_ARGS} + DEPENDS gflags + GIT_REPOSITORY ${ABSL_REPOSITORY} + GIT_TAG ${ABSL_TAG} + PREFIX ${ABSL_SOURCES_DIR} + UPDATE_COMMAND "" + CMAKE_ARGS ${OPTIONAL_ARGS} + -DCMAKE_INSTALL_PREFIX=${ABSL_INSTALL_DIR} + -DCMAKE_INSTALL_LIBDIR=${ABSL_INSTALL_DIR}/lib + -DCMAKE_POSITION_INDEPENDENT_CODE=ON + -DWITH_GFLAGS=ON + -Dgflags_DIR=${GFLAGS_INSTALL_DIR}/lib/cmake/gflags + -DBUILD_TESTING=OFF + -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE} + ${EXTERNAL_OPTIONAL_ARGS} + CMAKE_CACHE_ARGS + -DCMAKE_INSTALL_PREFIX:PATH=${ABSL_INSTALL_DIR} + -DCMAKE_INSTALL_LIBDIR:PATH=${ABSL_INSTALL_DIR}/lib + 
-DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON + -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE} + BUILD_BYPRODUCTS ${ABSL_INSTALL_DIR}/lib/libabsl_base.a + BUILD_BYPRODUCTS ${ABSL_INSTALL_DIR}/lib/libabsl_hash.a + BUILD_BYPRODUCTS ${ABSL_INSTALL_DIR}/lib/libabsl_wyhash.a + BUILD_BYPRODUCTS ${ABSL_INSTALL_DIR}/lib/libabsl_city.a + BUILD_BYPRODUCTS ${ABSL_INSTALL_DIR}/lib/libabsl_strings.a + BUILD_BYPRODUCTS ${ABSL_INSTALL_DIR}/lib/libabsl_throw_delegate.a + BUILD_BYPRODUCTS ${ABSL_INSTALL_DIR}/lib/libabsl_bad_any_cast_impl.a + BUILD_BYPRODUCTS ${ABSL_INSTALL_DIR}/lib/libabsl_bad_optional_access.a + BUILD_BYPRODUCTS ${ABSL_INSTALL_DIR}/lib/libabsl_bad_variant_access.a + BUILD_BYPRODUCTS ${ABSL_INSTALL_DIR}/lib/libabsl_raw_hash_set.a) + +# It may be more convinent if we just include all absl libs +set(ABSL_LIB_NAMES + hash + wyhash + city + strings + throw_delegate + bad_any_cast_impl + bad_optional_access + bad_variant_access + raw_hash_set) +set(ABSL_LIBS "") + +add_library(absl STATIC IMPORTED GLOBAL) +set_property(TARGET absl PROPERTY IMPORTED_LOCATION + ${ABSL_INSTALL_DIR}/lib/libabsl_base.a) + +if(NOT USE_PREBUILD_EXTERNAL) + add_dependencies(absl external_absl) +endif() +foreach(lib_name ${ABSL_LIB_NAMES}) + target_link_libraries(absl + INTERFACE ${ABSL_INSTALL_DIR}/lib/libabsl_${lib_name}.a) +endforeach() +include_directories(${ABSL_INSTALL_DIR}/include) diff --git a/cmake/cinn/external/boost.cmake b/cmake/cinn/external/boost.cmake new file mode 100644 index 0000000000000..773b2f89f1704 --- /dev/null +++ b/cmake/cinn/external/boost.cmake @@ -0,0 +1,65 @@ +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +include(ExternalProject) + +set(BOOST_PROJECT "extern_boost") +# To release PaddlePaddle as a pip package, we have to follow the +# manylinux1 standard, which features as old Linux kernels and +# compilers as possible and recommends CentOS 5. Indeed, the earliest +# CentOS version that works with NVIDIA CUDA is CentOS 6. And a new +# version of boost, say, 1.66.0, doesn't build on CentOS 6. We +# checked that the devtools package of CentOS 6 installs boost 1.41.0. +# So we use 1.41.0 here. +set(BOOST_VER "1.41.0") +set(BOOST_TAR + "boost_1_41_0" + CACHE STRING "" FORCE) +set(BOOST_URL + "http://paddlepaddledeps.bj.bcebos.com/${BOOST_TAR}.tar.gz" + CACHE STRING "" FORCE) + +message(STATUS "BOOST_TAR: ${BOOST_TAR}, BOOST_URL: ${BOOST_URL}") + +set(BOOST_SOURCES_DIR ${CINN_THIRD_PARTY_PATH}/boost) +set(BOOST_DOWNLOAD_DIR "${BOOST_SOURCES_DIR}/src/${BOOST_PROJECT}") + +set(BOOST_INCLUDE_DIR + "${BOOST_DOWNLOAD_DIR}" + CACHE PATH "boost include directory." 
FORCE) +set_directory_properties(PROPERTIES CLEAN_NO_CUSTOM 1) +include_directories(${BOOST_INCLUDE_DIR}) + +ExternalProject_Add( + ${BOOST_PROJECT} + ${EXTERNAL_PROJECT_LOG_ARGS} + DOWNLOAD_DIR ${BOOST_DOWNLOAD_DIR} + URL ${BOOST_URL} + DOWNLOAD_NO_PROGRESS 1 + PREFIX ${BOOST_SOURCES_DIR} + CONFIGURE_COMMAND "" + BUILD_COMMAND "" + INSTALL_COMMAND "" + UPDATE_COMMAND "") + +if(${CMAKE_VERSION} VERSION_LESS "3.3.0" OR NOT WIN32) + set(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/boost_dummy.c) + file(WRITE ${dummyfile} "const char *dummy = \"${dummyfile}\";") + add_library(boost STATIC ${dummyfile}) +else() + add_library(boost INTERFACE) +endif() + +add_dependencies(boost ${BOOST_PROJECT}) +set(Boost_INCLUDE_DIR ${BOOST_INCLUDE_DIR}) diff --git a/cmake/cinn/external/ginac.cmake b/cmake/cinn/external/ginac.cmake new file mode 100644 index 0000000000000..5c31ac32fd790 --- /dev/null +++ b/cmake/cinn/external/ginac.cmake @@ -0,0 +1,36 @@ +include(ExternalProject) + +# gmp-6.2.1 https://gmplib.org/download/gmp/gmp-6.2.1.tar.xz +# cln-1.3.6 https://www.ginac.de/CLN/cln-1.3.6.tar.bz2 +# ginac-1.8.1 https://www.ginac.de/ginac-1.8.1.tar.bz2 +# all build with CFLAGS="-fPIC -DPIC" CXXFLAGS="-fPIC -DPIC" --enable-static=yes + +set(GINAC_DOWNLOAD_URL + https://paddle-inference-dist.bj.bcebos.com/CINN/ginac-1.8.1_cln-1.3.6_gmp-6.2.1.tar.gz +) +set(GINAC_MD5 ebc3e4b7770dd604777ac3f01bfc8b06) + +ExternalProject_Add( + external_ginac + ${EXTERNAL_PROJECT_LOG_ARGS} + URL ${GINAC_DOWNLOAD_URL} + URL_MD5 ${GINAC_MD5} + PREFIX ${CINN_THIRD_PARTY_PATH}/ginac + SOURCE_DIR ${CINN_THIRD_PARTY_PATH}/install/ginac + CONFIGURE_COMMAND "" + BUILD_COMMAND "" + UPDATE_COMMAND "" + INSTALL_COMMAND "" + BUILD_BYPRODUCTS ${CINN_THIRD_PARTY_PATH}/install/ginac/lib/libginac.a + BUILD_BYPRODUCTS ${CINN_THIRD_PARTY_PATH}/install/ginac/lib/libcln.a + BUILD_BYPRODUCTS ${CINN_THIRD_PARTY_PATH}/install/ginac/lib/libgmp.a) + +add_library(ginac STATIC IMPORTED GLOBAL) +add_dependencies(ginac external_ginac) +set_property( + TARGET ginac PROPERTY IMPORTED_LOCATION + ${CINN_THIRD_PARTY_PATH}/install/ginac/lib/libginac.a) +target_link_libraries( + ginac INTERFACE ${CINN_THIRD_PARTY_PATH}/install/ginac/lib/libcln.a + ${CINN_THIRD_PARTY_PATH}/install/ginac/lib/libgmp.a) +include_directories(${CINN_THIRD_PARTY_PATH}/install/ginac/include) diff --git a/cmake/cinn/external/isl.cmake b/cmake/cinn/external/isl.cmake new file mode 100644 index 0000000000000..a78dee350a5ad --- /dev/null +++ b/cmake/cinn/external/isl.cmake @@ -0,0 +1,32 @@ +include(ExternalProject) + +# isl https://github.com/inducer/ISL +# commit-id 6a1760fe46967cda2a06387793a6b7d4a0876581 +# depends on llvm f9dc2b7079350d0fed3bb3775f496b90483c9e42 +# depends on gmp-6.2.1 +# static build +# CPPFLAGS="-fPIC -DPIC" ./configure --with-gmp-prefix= --with-clang-prefix= --enable-shared=no --enable-static=yes + +set(ISL_DOWNLOAD_URL + https://paddle-inference-dist.bj.bcebos.com/CINN/isl-6a1760fe.tar.gz) +set(ISL_MD5 fff10083fb79d394b8a7b7b2089f6183) + +ExternalProject_Add( + external_isl + ${EXTERNAL_PROJECT_LOG_ARGS} + URL ${ISL_DOWNLOAD_URL} + URL_MD5 ${ISL_MD5} + PREFIX ${CINN_THIRD_PARTY_PATH}/isl + SOURCE_DIR ${CINN_THIRD_PARTY_PATH}/install/isl + CONFIGURE_COMMAND "" + BUILD_COMMAND "" + UPDATE_COMMAND "" + INSTALL_COMMAND "" + BUILD_BYPRODUCTS ${CINN_THIRD_PARTY_PATH}/install/isl/lib/libisl.a) + +add_library(isl STATIC IMPORTED GLOBAL) +set_property( + TARGET isl PROPERTY IMPORTED_LOCATION + ${CINN_THIRD_PARTY_PATH}/install/isl/lib/libisl.a) +add_dependencies(isl external_isl) 
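+# isl follows the same prebuilt-archive pattern as ginac above: the
+# ExternalProject only downloads and unpacks a pre-compiled tarball (the
+# configure, build, and install steps are all empty), and an IMPORTED STATIC
+# target points at the shipped libisl.a. Downstream code then links it like
+# any normal target; a hedged sketch with a hypothetical consumer:
+#   target_link_libraries(my_polyhedral_pass isl)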
+include_directories(${CINN_THIRD_PARTY_PATH}/install/isl/include)
diff --git a/cmake/cinn/external/jitify.cmake b/cmake/cinn/external/jitify.cmake
new file mode 100644
index 0000000000000..b04d64b12b8fb
--- /dev/null
+++ b/cmake/cinn/external/jitify.cmake
@@ -0,0 +1,28 @@
+if(NOT WITH_GPU)
+  set(JITIFY_FOUND OFF)
+  return()
+endif()
+
+include(ExternalProject)
+
+set(JITIFY_SOURCE_PATH ${CINN_THIRD_PARTY_PATH}/install/jitify)
+
+ExternalProject_Add(
+  external_jitify
+  ${EXTERNAL_PROJECT_LOG_ARGS}
+  GIT_REPOSITORY "https://github.com/NVIDIA/jitify.git"
+  # Pin jitify to a fixed commit for reproducible builds.
+  GIT_TAG 57de649139c866eb83acacfe50c92ad7c6278776
+  PREFIX ${CINN_THIRD_PARTY_PATH}/jitify
+  SOURCE_DIR ${JITIFY_SOURCE_PATH}
+  CONFIGURE_COMMAND ""
+  PATCH_COMMAND ""
+  BUILD_COMMAND ""
+  UPDATE_COMMAND ""
+  INSTALL_COMMAND "")
+
+include_directories(${JITIFY_SOURCE_PATH})
+
+add_library(extern_jitify INTERFACE)
+add_dependencies(extern_jitify external_jitify)
+set(jitify_deps extern_jitify)
diff --git a/cmake/cinn/external/llvm.cmake b/cmake/cinn/external/llvm.cmake
new file mode 100644
index 0000000000000..29ab0967e3053
--- /dev/null
+++ b/cmake/cinn/external/llvm.cmake
@@ -0,0 +1,129 @@
+include(FetchContent)
+
+# set(LLVM_DOWNLOAD_URL https://paddle-inference-dist.bj.bcebos.com/CINN/llvm11.tar.gz)
+# set(LLVM_MD5 39d32b6be466781dddf5869318dcba53)
+
+set(LLVM_DOWNLOAD_URL
+    https://paddle-inference-dist.bj.bcebos.com/CINN/llvm11-glibc2.17.tar.gz)
+set(LLVM_MD5 33c7d3cc6d370585381e8d90bd7c2198)
+
+set(FETCHCONTENT_BASE_DIR ${CINN_THIRD_PARTY_PATH}/llvm)
+set(FETCHCONTENT_QUIET OFF)
+FetchContent_Declare(
+  external_llvm
+  URL ${LLVM_DOWNLOAD_URL}
+  URL_MD5 ${LLVM_MD5}
+  PREFIX ${CINN_THIRD_PARTY_PATH}/llvm
+  SOURCE_DIR ${CINN_THIRD_PARTY_PATH}/install/llvm)
+if(NOT LLVM_PATH)
+  FetchContent_GetProperties(external_llvm)
+  if(NOT external_llvm_POPULATED)
+    FetchContent_Populate(external_llvm)
+  endif()
+  set(LLVM_PATH ${CINN_THIRD_PARTY_PATH}/install/llvm)
+  set(LLVM_DIR ${CINN_THIRD_PARTY_PATH}/install/llvm/lib/cmake/llvm)
+  set(MLIR_DIR ${CINN_THIRD_PARTY_PATH}/install/llvm/lib/cmake/mlir)
+else()
+  set(LLVM_DIR ${LLVM_PATH}/lib/cmake/llvm)
+  set(MLIR_DIR ${LLVM_PATH}/lib/cmake/mlir)
+endif()
+
+if(${CMAKE_CXX_COMPILER} STREQUAL "clang++")
+  set(CMAKE_EXE_LINKER_FLAGS
+      "${CMAKE_EXE_LINKER_FLAGS} -stdlib=libc++ -lc++abi")
+endif()
+
+message(STATUS "set LLVM_DIR: ${LLVM_DIR}")
+message(STATUS "set MLIR_DIR: ${MLIR_DIR}")
+find_package(LLVM REQUIRED CONFIG HINTS ${LLVM_DIR})
+find_package(MLIR REQUIRED CONFIG HINTS ${MLIR_DIR})
+find_package(ZLIB REQUIRED)
+
+list(APPEND CMAKE_MODULE_PATH "${LLVM_CMAKE_DIR}")
+include(AddLLVM)
+
+include_directories(${LLVM_INCLUDE_DIRS})
+list(APPEND CMAKE_MODULE_PATH "${LLVM_CMAKE_DIR}")
+list(APPEND CMAKE_MODULE_PATH "${MLIR_CMAKE_DIR}")
+include(AddLLVM)
+include(TableGen)
+include(AddMLIR)
+
+message(STATUS "Found MLIR: ${MLIR_DIR}")
+message(STATUS "Found LLVM ${LLVM_PACKAGE_VERSION}")
+message(STATUS "Using LLVMConfig.cmake in: ${LLVM_DIR}")
+
+# To build with MLIR, LLVM is built from source using the following flags:
+
+#[==[
+cmake -G Ninja ../llvm \
+  -DLLVM_ENABLE_PROJECTS="mlir;clang" \
+  -DLLVM_BUILD_EXAMPLES=OFF \
+  -DLLVM_TARGETS_TO_BUILD="X86" \
+  -DCMAKE_BUILD_TYPE=Release \
+  -DLLVM_ENABLE_ASSERTIONS=ON \
+  -DLLVM_ENABLE_ZLIB=OFF \
+  -DLLVM_ENABLE_RTTI=ON \
+  -DLLVM_ENABLE_TERMINFO=OFF \
+  -DCMAKE_INSTALL_PREFIX=./install
+#]==]
+
+# The matched llvm-project version is f9dc2b7079350d0fed3bb3775f496b90483c9e42
+# (currently a temporary commit).
+# Update: to build
llvm in manylinux docker with glibc-2.17, and use it in manylinux and ubuntu docker, +# the patch https://gist.github.com/zhiqiu/6e8d969176dce13d98fd15338a16265e is needed. + +add_definitions(${LLVM_DEFINITIONS}) + +llvm_map_components_to_libnames( + llvm_libs + Support + Core + irreader + X86 + executionengine + orcjit + mcjit + all + codegen) + +message(STATUS "LLVM libs: ${llvm_libs}") + +get_property(mlir_libs GLOBAL PROPERTY MLIR_ALL_LIBS) +message(STATUS "MLIR libs: ${mlir_libs}") +add_definitions(${LLVM_DEFINITIONS}) + +# The minimum needed libraries for MLIR IR parse and transform. +set(MLIR_IR_LIBS + MLIRAnalysis + MLIRStandardOps + MLIRPass + MLIRParser + MLIRDialect + MLIRIR + MLIROptLib) + +# tb_base is the name of a xxx.td file (without the .td suffix) +function(mlir_tablegen_on td_base) + set(options) + set(oneValueArgs DIALECT) + cmake_parse_arguments(mlir_tablegen_on "${options}" "${oneValueArgs}" + "${multiValueArgs}" ${ARGN}) + + set(LLVM_TARGET_DEFINITIONS ${td_base}.td) + mlir_tablegen(${td_base}.hpp.inc -gen-op-decls) + mlir_tablegen(${td_base}.cpp.inc -gen-op-defs) + if(mlir_tablegen_on_DIALECT) + mlir_tablegen(${td_base}_dialect.hpp.inc --gen-dialect-decls + -dialect=${mlir_tablegen_on_DIALECT}) + endif() + add_public_tablegen_target(${td_base}_IncGen) + add_custom_target(${td_base}_inc DEPENDS ${td_base}_IncGen) +endfunction() + +function(mlir_add_rewriter td_base) + set(LLVM_TARGET_DEFINITIONS ${td_base}.td) + mlir_tablegen(${td_base}.hpp.inc -gen-rewriters + "-I${CMAKE_SOURCE_DIR}/infrt/dialect/pass") + add_public_tablegen_target(${td_base}_IncGen) + add_custom_target(${td_base}_inc DEPENDS ${td_base}_IncGen) +endfunction() diff --git a/cmake/cinn/external/openmp.cmake b/cmake/cinn/external/openmp.cmake new file mode 100644 index 0000000000000..2a0194636d6c2 --- /dev/null +++ b/cmake/cinn/external/openmp.cmake @@ -0,0 +1,37 @@ +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
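+# USE_OPENMP selects the OpenMP runtime below: "gnu" uses the toolchain's own
+# OpenMP support (libgomp for gcc), "intel" pairs OpenMP with MKL's iomp
+# runtime, and any other value leaves OpenMP off. The value comes from the
+# customized config; for example, cmake/cinn/config.cmake above sets:
+#   set(USE_OPENMP "intel")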
+ +if(USE_OPENMP STREQUAL "gnu") + find_package(OpenMP) + if(OPENMP_FOUND) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}") + add_definitions(-DCINN_USE_OPENMP) + set(WITH_OPENMP ON) + message(STATUS "Build with OpenMP ${OpenMP_CXX_LIBRARIES}") + message(STATUS "CMAKE_CXX_FLAGS " ${CMAKE_CXX_FLAGS}) + else() + set(WITH_OPENMP OFF) + endif() +elseif(USE_OPENMP STREQUAL "intel") + find_package(OpenMP) + if(OPENMP_FOUND) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}") + message(STATUS "CMAKE_CXX_FLAGS " ${CMAKE_CXX_FLAGS}) + add_definitions(-DCINN_USE_OPENMP) + set(WITH_OPENMP ON) + message(STATUS "Build with OpenMP " ${MKLML_IOMP_LIB}) + else() + set(WITH_OPENMP OFF) + endif() +endif() diff --git a/cmake/cinn/llvm.cmake b/cmake/cinn/llvm.cmake new file mode 100644 index 0000000000000..4fc274e6983cd --- /dev/null +++ b/cmake/cinn/llvm.cmake @@ -0,0 +1,86 @@ +if(${CMAKE_CXX_COMPILER} STREQUAL "clang++") + set(CMAKE_EXE_LINKER_FLAGS + "${CMAKE_EXE_LINKER_FLAGS} -stdlib=libc++ -lc++abi") +endif() + +message(STATUS "set LLVM_DIR: ${LLVM_DIR}") +message(STATUS "set MLIR_DIR: ${MLIR_DIR}") +find_package(LLVM REQUIRED CONFIG HINTS ${LLVM_DIR}) +find_package(MLIR REQUIRED CONFIG HINTS ${MLIR_DIR}) +find_package(ZLIB REQUIRED) + +list(APPEND CMAKE_MODULE_PATH "${LLVM_CMAKE_DIR}") +include(AddLLVM) + +include_directories(${LLVM_INCLUDE_DIRS}) +list(APPEND CMAKE_MODULE_PATH "${LLVM_CMAKE_DIR}") +list(APPEND CMAKE_MODULE_PATH "${MLIR_CMAKE_DIR}") +include(AddLLVM) +include(TableGen) +include(AddMLIR) + +message(STATUS "Found MLIR: ${MLIR_DIR}") +message(STATUS "Found LLVM ${LLVM_PACKAGE_VERSION}") +message(STATUS "Using LLVMConfig.cmake in: ${LLVM_DIR}") + +# To build with MLIR, the LLVM is build from source code using the following flags: + +#[==[ +cmake -G Ninja ../llvm \ + -DLLVM_ENABLE_PROJECTS=mlir \ + -DLLVM_BUILD_EXAMPLES=OFF \ + -DLLVM_TARGETS_TO_BUILD="X86" \ + -DCMAKE_BUILD_TYPE=Release \ + -DLLVM_ENABLE_ASSERTIONS=ON \ + -DLLVM_ENABLE_ZLIB=OFF \ + -DLLVM_ENABLE_RTTI=ON \ +#]==] +# The matched llvm-project version is f9dc2b7079350d0fed3bb3775f496b90483c9e42 (currently a temporary commit) + +add_definitions(${LLVM_DEFINITIONS}) + +llvm_map_components_to_libnames( + llvm_libs + Support + Core + irreader + X86 + executionengine + orcjit + mcjit + all + codegen) + +message(STATUS "LLVM libs: ${llvm_libs}") + +get_property(mlir_libs GLOBAL PROPERTY MLIR_ALL_LIBS) +message(STATUS "MLIR libs: ${mlir_libs}") +add_definitions(${LLVM_DEFINITIONS}) + +# The minimum needed libraries for MLIR IR parse and transform. 
+set(MLIR_IR_LIBS + MLIRAnalysis + MLIRStandardOps + MLIRPass + MLIRParser + MLIRDialect + MLIRIR + MLIROptLib) + +# tb_base is the name of a xxx.td file (without the .td suffix) +function(mlir_tablegen_on td_base) + set(options) + set(oneValueArgs DIALECT) + cmake_parse_arguments(mlir_tablegen_on "${options}" "${oneValueArgs}" + "${multiValueArgs}" ${ARGN}) + + set(LLVM_TARGET_DEFINITIONS ${td_base}.td) + mlir_tablegen(${td_base}.hpp.inc -gen-op-decls) + mlir_tablegen(${td_base}.cpp.inc -gen-op-defs) + if(mlir_tablegen_on_DIALECT) + mlir_tablegen(${td_base}_dialect.hpp.inc --gen-dialect-decls + -dialect=${mlir_tablegen_on_DIALECT}) + endif() + add_public_tablegen_target(${td_base}_IncGen) + add_custom_target(${td_base}_inc DEPENDS ${td_base}_IncGen) +endfunction() diff --git a/cmake/cinn/nvrtc.cmake b/cmake/cinn/nvrtc.cmake new file mode 100644 index 0000000000000..987bebfab0c05 --- /dev/null +++ b/cmake/cinn/nvrtc.cmake @@ -0,0 +1,24 @@ +if(NOT WITH_GPU) + return() +endif() + +find_package(PkgConfig) + +find_library( + CUDA_NVRTC_LIB libnvrtc nvrtc + HINTS "${CUDA_TOOLKIT_ROOT_DIR}/lib64" "${LIBNVRTC_LIBRARY_DIR}" + "${CUDA_TOOLKIT_ROOT_DIR}/lib/x64" /usr/lib64 /usr/local/cuda/lib64) + +include(FindPackageHandleStandardArgs) +find_package_handle_standard_args(LibNVRTC DEFAULT_MSG CUDA_NVRTC_LIB) + +message(STATUS "found NVRTC: ${CUDA_NVRTC_LIB}") + +mark_as_advanced(CUDA_NVRTC_LIB) + +if(NOT LIBNVRTC_FOUND) + message( + FATAL_ERROR + "Cuda NVRTC Library not found: Specify the LIBNVRTC_LIBRARY_DIR where libnvrtc is located" + ) +endif() diff --git a/cmake/cinn/nvtx.cmake b/cmake/cinn/nvtx.cmake new file mode 100644 index 0000000000000..d5d2049a68d40 --- /dev/null +++ b/cmake/cinn/nvtx.cmake @@ -0,0 +1,53 @@ +if((NOT WITH_GPU) + OR WIN32 + OR APPLE) + set(NVTX_FOUND OFF) + return() +endif() + +set(NVTX_ROOT + "/usr" + CACHE PATH "NVTX ROOT") +find_path( + NVTX_INCLUDE_DIR nvToolsExt.h + PATHS ${NVTX_ROOT} ${NVTX_ROOT}/include $ENV{NVTX_ROOT} + $ENV{NVTX_ROOT}/include ${CUDA_TOOLKIT_INCLUDE} + NO_DEFAULT_PATH) + +get_filename_component(__libpath_hint ${CUDA_CUDART_LIBRARY} PATH) + +set(TARGET_ARCH "x86_64") +if(NOT ${CMAKE_SYSTEM_PROCESSOR}) + set(TARGET_ARCH ${CMAKE_SYSTEM_PROCESSOR}) +endif() + +list( + APPEND + NVTX_CHECK_LIBRARY_DIRS + ${NVTX_ROOT} + ${NVTX_ROOT}/lib64 + ${NVTX_ROOT}/lib + ${NVTX_ROOT}/lib/${TARGET_ARCH}-linux-gnu + $ENV{NVTX_ROOT} + $ENV{NVTX_ROOT}/lib64 + $ENV{NVTX_ROOT}/lib + ${CUDA_TOOLKIT_ROOT_DIR} + ${CUDA_TOOLKIT_ROOT_DIR}/targets/${TARGET_ARCH}-linux/lib) + +find_library( + CUDA_NVTX_LIB + NAMES libnvToolsExt.so + PATHS ${NVTX_CHECK_LIBRARY_DIRS} ${NVTX_INCLUDE_DIR} ${__libpath_hint} + NO_DEFAULT_PATH + DOC "Path to the NVTX library.") + +if(NVTX_INCLUDE_DIR AND CUDA_NVTX_LIB) + set(NVTX_FOUND ON) +else() + set(NVTX_FOUND OFF) +endif() + +if(NVTX_FOUND) + include_directories(${NVTX_INCLUDE_DIR}) + add_definitions(-DCINN_WITH_NVTX) +endif() diff --git a/cmake/cinn/system.cmake b/cmake/cinn/system.cmake new file mode 100644 index 0000000000000..b7e8a760712fc --- /dev/null +++ b/cmake/cinn/system.cmake @@ -0,0 +1,106 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Detects the OS and sets appropriate variables. +# CMAKE_SYSTEM_NAME only give us a coarse-grained name of the OS CMake is +# building for, but the host processor name like centos is necessary +# in some scenes to distinguish system for customization. +# +# for instance, protobuf libs path is /lib64 +# on CentOS, but /lib on other systems. + +if(UNIX AND NOT APPLE) + # except apple from nix*Os family + set(LINUX TRUE) +endif() + +if(WIN32) + set(HOST_SYSTEM "win32") +else() + if(APPLE) + set(HOST_SYSTEM "macosx") + exec_program( + sw_vers ARGS + -productVersion + OUTPUT_VARIABLE HOST_SYSTEM_VERSION) + string(REGEX MATCH "[0-9]+.[0-9]+" MACOS_VERSION "${HOST_SYSTEM_VERSION}") + if(NOT DEFINED $ENV{MACOSX_DEPLOYMENT_TARGET}) + # Set cache variable - end user may change this during ccmake or cmake-gui configure. + set(CMAKE_OSX_DEPLOYMENT_TARGET + ${MACOS_VERSION} + CACHE + STRING + "Minimum OS X version to target for deployment (at runtime); newer APIs weak linked. Set to empty string for default value." + ) + endif() + set(CMAKE_EXE_LINKER_FLAGS "-framework CoreFoundation -framework Security") + else() + + if(EXISTS "/etc/issue") + file(READ "/etc/issue" LINUX_ISSUE) + if(LINUX_ISSUE MATCHES "CentOS") + set(HOST_SYSTEM "centos") + elseif(LINUX_ISSUE MATCHES "Debian") + set(HOST_SYSTEM "debian") + elseif(LINUX_ISSUE MATCHES "Ubuntu") + set(HOST_SYSTEM "ubuntu") + elseif(LINUX_ISSUE MATCHES "Red Hat") + set(HOST_SYSTEM "redhat") + elseif(LINUX_ISSUE MATCHES "Fedora") + set(HOST_SYSTEM "fedora") + endif() + + string(REGEX MATCH "(([0-9]+)\\.)+([0-9]+)" HOST_SYSTEM_VERSION + "${LINUX_ISSUE}") + endif() + + if(EXISTS "/etc/redhat-release") + file(READ "/etc/redhat-release" LINUX_ISSUE) + if(LINUX_ISSUE MATCHES "CentOS") + set(HOST_SYSTEM "centos") + endif() + endif() + + if(NOT HOST_SYSTEM) + set(HOST_SYSTEM ${CMAKE_SYSTEM_NAME}) + endif() + + endif() +endif() + +# query number of logical cores +cmake_host_system_information(RESULT CPU_CORES QUERY NUMBER_OF_LOGICAL_CORES) + +mark_as_advanced(HOST_SYSTEM CPU_CORES) + +message( + STATUS + "Found Paddle host system: ${HOST_SYSTEM}, version: ${HOST_SYSTEM_VERSION}") +message(STATUS "Found Paddle host system's CPU: ${CPU_CORES} cores") + +# external dependencies log output +set(EXTERNAL_PROJECT_LOG_ARGS + LOG_DOWNLOAD + 0 # Wrap download in script to log output + LOG_UPDATE + 1 # Wrap update in script to log output + LOG_CONFIGURE + 1 # Wrap configure in script to log output + LOG_BUILD + 0 # Wrap build in script to log output + LOG_TEST + 1 # Wrap test in script to log output + LOG_INSTALL + 0 # Wrap install in script to log output +) diff --git a/cmake/cinn/version.cmake b/cmake/cinn/version.cmake new file mode 100644 index 0000000000000..6b5534ae9184f --- /dev/null +++ b/cmake/cinn/version.cmake @@ -0,0 +1,76 @@ +# Get the latest git tag. 
+set(CINN_VERSION $ENV{CINN_VERSION}) +set(tmp_version "HEAD") +set(TAG_VERSION_REGEX "[0-9]+\\.[0-9]+\\.[0-9]+(\\.(a|b|rc)\\.[0-9]+)?") +set(COMMIT_VERSION_REGEX "[0-9a-f]+[0-9a-f]+[0-9a-f]+[0-9a-f]+[0-9a-f]+") +while("${CINN_VERSION}" STREQUAL "") + # Check current branch name + execute_process( + COMMAND ${GIT_EXECUTABLE} rev-parse --abbrev-ref ${tmp_version} + WORKING_DIRECTORY ${PADDLE_SOURCE_DIR} + OUTPUT_VARIABLE GIT_BRANCH_NAME + RESULT_VARIABLE GIT_BRANCH_RESULT + ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE) + if(NOT ${GIT_BRANCH_RESULT}) + execute_process( + COMMAND ${GIT_EXECUTABLE} describe --tags --abbrev=0 --always + ${tmp_version} + WORKING_DIRECTORY ${PADDLE_SOURCE_DIR} + OUTPUT_VARIABLE GIT_TAG_NAME + RESULT_VARIABLE GIT_RESULT + ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE) + if(NOT ${GIT_RESULT}) + # Check if current branch is release branch + if(${GIT_BRANCH_NAME} MATCHES "release/${TAG_VERSION_REGEX}") + # Check the tag is a correct version + if(${GIT_TAG_NAME} MATCHES "${COMMIT_VERSION_REGEX}") + # if no tag was found, set CINN_VERSION to 0.0.0 to represent latest + set(CINN_VERSION "0.0.0") + elseif(${GIT_TAG_NAME} MATCHES "v${TAG_VERSION_REGEX}") + string(REPLACE "v" "" CINN_VERSION ${GIT_TAG_NAME}) + else() # otherwise, get the previous git tag name. + set(tmp_version "${GIT_TAG_NAME}~1") + endif() + else() + execute_process( + COMMAND ${GIT_EXECUTABLE} describe --exact-match --tags ${tmp_version} + WORKING_DIRECTORY ${PADDLE_SOURCE_DIR} + OUTPUT_VARIABLE GIT_EXACT_TAG_NAME + RESULT_VARIABLE GIT_EXACT_TAG_RESULT + ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE) + if(NOT ${GIT_EXACT_TAG_NAME}) + # Check if current branch is tag branch + if(${GIT_EXACT_TAG_NAME} MATCHES "v${TAG_VERSION_REGEX}") + string(REPLACE "v" "" CINN_VERSION ${GIT_EXACT_TAG_NAME}) + else() + set(CINN_VERSION "0.0.0") + endif() + else() + # otherwise, we always set CINN_VERSION to 0.0.0 to represent latest + set(CINN_VERSION "0.0.0") + endif() + endif() + else() + set(CINN_VERSION "0.0.0") + message(WARNING "Cannot add CINN version from git tag") + endif() + else() + set(CINN_VERSION "0.0.0") + message(WARNING "Cannot add CINN version for wrong git branch result") + endif() +endwhile() + +string(REPLACE "-" "." CINN_VER_LIST ${CINN_VERSION}) +string(REPLACE "." ";" CINN_VER_LIST ${CINN_VER_LIST}) +list(GET CINN_VER_LIST 0 CINN_MAJOR_VER) +list(GET CINN_VER_LIST 1 CINN_MINOR_VER) +list(GET CINN_VER_LIST 2 CINN_PATCH_VER) +math(EXPR CINN_VERSION_INTEGER "${CINN_MAJOR_VER} * 1000000 + + ${CINN_MINOR_VER} * 1000 + ${CINN_PATCH_VER}") + +add_definitions(-DCINN_VERSION=${CINN_VERSION}) +add_definitions(-DCINN_VERSION_INTEGER=${CINN_VERSION_INTEGER}) +message( + STATUS + "CINN version is ${CINN_VERSION} (major: ${CINN_MAJOR_VER}, minor: ${CINN_MINOR_VER}, patch: ${CINN_PATCH_VER})" +) diff --git a/cmake/external/cinn.cmake b/cmake/external/cinn.cmake deleted file mode 100644 index 7d494ef516cae..0000000000000 --- a/cmake/external/cinn.cmake +++ /dev/null @@ -1,96 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. - -if(NOT WITH_CINN) - return() -endif() - -if(NOT CINN_GIT_TAG) - set(CINN_GIT_TAG develop) -endif() - -message(STATUS "CINN version: " ${CINN_GIT_TAG}) - -# TODO(zhhsplendid): CINN has lots of warnings during early development. -# They will be treated as errors under paddle. We set no-error now and we will -# clean the code in the future. -add_definitions(-w) - -###################################### -# Build CINN from Git External Project -###################################### -include(ExternalProject) -set(CINN_PREFIX_DIR ${THIRD_PARTY_PATH}/CINN) -set(CINN_OPTIONAL_ARGS - -DPY_VERSION=${PY_VERSION} - -DWITH_CUDA=${WITH_GPU} - -DWITH_CUDNN=${WITH_GPU} - -DWITH_MKL_CBLAS=${WITH_MKL} - -DWITH_MKLDNN=${WITH_MKL} - -DPUBLISH_LIBS=ON - -DWITH_TESTING=ON - -DPYTHON_EXECUTABLE=${PYTHON_EXECUTABLE} - -DPYTHON_INCLUDE_DIR=${PYTHON_INCLUDE_DIR} - -DPYTHON_LIBRARIES=${PYTHON_LIBRARIES}) -set(CINN_BUILD_COMMAND ${CMAKE_COMMAND} --build . --target cinnapi -j) -set(CINN_BINARY_DIR ${CINN_PREFIX_DIR}/src/external_cinn-build) -set(CINN_LIB_NAME "libcinnapi.so") -set(CINN_LIB_LOCATION "${CINN_BINARY_DIR}/dist/cinn/lib") -set(CINN_LIB "${CINN_LIB_LOCATION}/${CINN_LIB_NAME}") - -ExternalProject_Add( - external_cinn - ${EXTERNAL_PROJECT_LOG_ARGS} - GIT_REPOSITORY "${GIT_URL}/PaddlePaddle/CINN.git" - GIT_TAG ${CINN_GIT_TAG} - PREFIX ${CINN_PREFIX_DIR} - BUILD_COMMAND ${CINN_BUILD_COMMAND} - INSTALL_COMMAND "" - CMAKE_ARGS ${CINN_OPTIONAL_ARGS} - BUILD_BYPRODUCTS ${CINN_LIB}) - -ExternalProject_Get_Property(external_cinn BINARY_DIR) -ExternalProject_Get_Property(external_cinn SOURCE_DIR) -set(CINN_SOURCE_DIR ${SOURCE_DIR}) - -message(STATUS "CINN BINARY_DIR: ${CINN_BINARY_DIR}") -message(STATUS "CINN SOURCE_DIR: ${CINN_SOURCE_DIR}") - -###################################### -# Add CINN's dependencies header files -###################################### - -# Add absl -set(ABSL_INCLUDE_DIR "${CINN_BINARY_DIR}/dist/third_party/absl/include") -include_directories(${ABSL_INCLUDE_DIR}) - -# Add isl -set(ISL_INCLUDE_DIR "${CINN_BINARY_DIR}/dist/third_party/isl/include") -include_directories(${ISL_INCLUDE_DIR}) - -# Add LLVM -set(LLVM_INCLUDE_DIR "${CINN_BINARY_DIR}/dist/third_party/llvm/include") -include_directories(${LLVM_INCLUDE_DIR}) - -###################################################### -# Put external_cinn and dependencies together as a lib -###################################################### - -set(CINN_INCLUDE_DIR "${CINN_BINARY_DIR}/dist/cinn/include") - -add_library(cinn SHARED IMPORTED GLOBAL) -set_target_properties(cinn PROPERTIES IMPORTED_LOCATION - "${CINN_LIB_LOCATION}/${CINN_LIB_NAME}") -include_directories(${CINN_INCLUDE_DIR}) -add_dependencies(cinn external_cinn) diff --git a/cmake/external/pybind11.cmake b/cmake/external/pybind11.cmake index 6ce8290d72f42..1e0838145a63d 100644 --- a/cmake/external/pybind11.cmake +++ b/cmake/external/pybind11.cmake @@ -24,7 +24,7 @@ set(SOURCE_INCLUDE_DIR ${SOURCE_DIR}/include) include_directories(${PYBIND_INCLUDE_DIR}) set(PYBIND_PATCH_COMMAND "") -if(NOT WIN32) +if(NOT WIN32 AND NOT CINN_ONLY) file(TO_NATIVE_PATH ${PADDLE_SOURCE_DIR}/patches/pybind/cast.h.patch native_dst) # Note: [Why calling some `git` commands before `patch`?] 
diff --git a/cmake/third_party.cmake b/cmake/third_party.cmake
index 43f5604f2808c..d33b86944008a 100755
--- a/cmake/third_party.cmake
+++ b/cmake/third_party.cmake
@@ -260,6 +260,29 @@ if(${CMAKE_VERSION} VERSION_GREATER "3.5.2")
 endif()

 ########################### include third_party according to flags ###############################
+
+if(CINN_ONLY)
+  include(external/zlib)
+  include(external/gflags)
+  include(external/glog)
+  include(external/gtest)
+  include(external/protobuf)
+  if(WITH_PYTHON)
+    include(external/pybind11)
+  endif()
+  if(WITH_MKL)
+    include(external/mklml)
+    generate_dummy_static_lib(LIB_NAME "cinn_mklml" GENERATOR "mklml.cmake")
+    target_link_libraries(cinn_mklml ${MKLML_LIB} ${MKLML_IOMP_LIB})
+    add_definitions(-DCINN_WITH_MKL_CBLAS)
+  endif()
+  if(WITH_MKLDNN)
+    include(external/mkldnn)
+    add_definitions(-DCINN_WITH_MKLDNN)
+  endif()
+  return()
+endif()
+
 include(external/zlib) # download, build, install zlib
 include(external/gflags) # download, build, install gflags
 include(external/glog) # download, build, install glog
@@ -474,20 +497,6 @@ if(WITH_LITE)
   include(external/lite)
 endif()

-if(WITH_CINN)
-  message(STATUS "Compile Paddle with CINN.")
-  include(external/cinn)
-  add_definitions(-DPADDLE_WITH_CINN)
-  if(WITH_GPU)
-    add_definitions(-DCINN_WITH_CUDA)
-    add_definitions(-DCINN_WITH_CUDNN)
-  endif()
-  if(WITH_MKL)
-    add_definitions(-DCINN_WITH_MKL_CBLAS)
-    add_definitions(-DCINN_WITH_MKLDNN)
-  endif()
-endif()
-
 if(WITH_CRYPTO)
   include(external/cryptopp) # download, build, install cryptopp
   list(APPEND third_party_deps extern_cryptopp)
diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt
index 8d9073b398417..cd7dc7e12f2a3 100644
--- a/python/CMakeLists.txt
+++ b/python/CMakeLists.txt
@@ -1,3 +1,43 @@
+if(CINN_ONLY)
+  file(GLOB_RECURSE CINN_PY_FILES ${PROJECT_SOURCE_DIR}/python/cinn/*.py)
+  set(CINN_PYTHON_DIR ${PROJECT_SOURCE_DIR}/python/cinn)
+  set(CINN_CORE_API ${CMAKE_BINARY_DIR}/python/cinn/core_api.so)
+
+  if(WITH_GPU)
+    set(PACKAGE_NAME "cinn-gpu")
+  else()
+    set(PACKAGE_NAME "cinn")
+  endif()
+  set(SETUP_LOG_FILE "setup.py.log")
+  configure_file(${CMAKE_CURRENT_SOURCE_DIR}/setup_cinn.py.in
+                 ${CMAKE_CURRENT_BINARY_DIR}/setup_cinn.py)
+
+  if(NOT PYTHON_EXECUTABLE)
+    find_package(PythonInterp ${PY_VERSION} REQUIRED)
+    find_package(PythonLibs ${PY_VERSION} REQUIRED)
+  endif()
+
+  message(STATUS "PYTHON_EXECUTABLE: ${PYTHON_EXECUTABLE}")
+
+  # There may be a link file called core_api.so under the dir ${CINN_PYTHON_DIR} due to the `mac_doc`
+  # function defined in build.sh. So, we need to copy the directory ${CINN_PYTHON_DIR} first and
+  # then core_api.so.
+  add_custom_command(
+    OUTPUT ${CINN_CORE_API}
+    COMMAND cp -rf --remove-destination ${CINN_PYTHON_DIR}
+            ${CMAKE_BINARY_DIR}/python/cinn
+    COMMAND cp --remove-destination
+            ${CMAKE_BINARY_DIR}/paddle/cinn/pybind/core_api.so ${CINN_CORE_API}
+    COMMAND cd ${CMAKE_CURRENT_BINARY_DIR} && ${PYTHON_EXECUTABLE} setup_cinn.py
+            bdist_wheel
+    DEPENDS core_api ${CINN_PY_FILES})
+
+  add_custom_target(COPY_CINN_CORE_API ALL DEPENDS ${CINN_CORE_API}
+                                                   ${CINN_PY_FILES})
+
+  return()
+endif()
+
 file(GLOB UTILS_PY_FILES . ./paddle/legacy/utils/*.py)
 file(GLOB_RECURSE FLUID_PY_FILES ./paddle/fluid/*.py)
 set(PY_FILES paddle/__init__.py ${UTILS_PY_FILES} ${FLUID_PY_FILES})
diff --git a/python/setup_cinn.py.in b/python/setup_cinn.py.in
new file mode 100644
index 0000000000000..fbdaac8625840
--- /dev/null
+++ b/python/setup_cinn.py.in
@@ -0,0 +1,181 @@
+import os
+import re
+import subprocess
+import sys
+import shutil
+import errno
+from contextlib import contextmanager
+from setuptools import setup
+
+def set_rpath(lib, rpath):
+    command = "patchelf --set-rpath '{}' {}".format(rpath, lib)
+    if os.system(command) != 0:
+        raise Exception("patch {} failed, command: {}".format(lib, command))
+
+def git_commit():
+    try:
+        cmd = ['git', 'rev-parse', 'HEAD']
+        git_commit = subprocess.Popen(cmd, stdout=subprocess.PIPE,
+            cwd="${PROJECT_SOURCE_DIR}").communicate()[0].strip()
+    except:
+        git_commit = b'Unknown'
+    git_commit = git_commit.decode()
+    return str(git_commit)
+
+def _get_version_detail(idx):
+    assert idx < 3, "version info consists of %(major)d.%(minor)d.%(patch)d, \
+        so the detail index must be less than 3"
+
+    if re.match('${TAG_VERSION_REGEX}', '${PADDLE_VERSION}'):
+        version_details = '${PADDLE_VERSION}'.split('.')
+
+        if len(version_details) >= 3:
+            return version_details[idx]
+
+    return 0
+
+def get_major():
+    return int(_get_version_detail(0))
+
+def get_minor():
+    return int(_get_version_detail(1))
+
+def get_patch():
+    return str(_get_version_detail(2))
+
+def get_cuda_version():
+    if '${WITH_GPU}' == 'ON':
+        return '${CUDA_VERSION}'
+    else:
+        return 'False'
+
+def get_cudnn_version():
+    if '${WITH_GPU}' == 'ON':
+        temp_cudnn_version = ''
+        if '${CUDNN_MAJOR_VERSION}':
+            temp_cudnn_version += '${CUDNN_MAJOR_VERSION}'
+            if '${CUDNN_MINOR_VERSION}':
+                temp_cudnn_version += '.${CUDNN_MINOR_VERSION}'
+                if '${CUDNN_PATCHLEVEL_VERSION}':
+                    temp_cudnn_version += '.${CUDNN_PATCHLEVEL_VERSION}'
+        return temp_cudnn_version
+    else:
+        return 'False'
+
+def is_tagged():
+    try:
+        cmd = ['git', 'describe', '--exact-match', '--tags', 'HEAD']
+        git_tag = subprocess.Popen(cmd, stdout=subprocess.PIPE,
+            stderr=subprocess.DEVNULL,
+            cwd="${PROJECT_SOURCE_DIR}").communicate()[0].strip()
+        git_tag = git_tag.decode()
+    except:
+        return False
+
+    if str(git_tag).replace('v', '') == '${CINN_VERSION}':
+        return True
+    else:
+        return False
+
+def write_version_py(filename='cinn/version/info.py'):
+    cnt = '''# THIS FILE IS GENERATED FROM CINN SETUP.PY
+#
+full_version = '%(major)d.%(minor)d.%(patch)s'
+major = '%(major)d'
+minor = '%(minor)d'
+patch = '%(patch)s'
+cuda_version = '%(cuda)s'
+cudnn_version = '%(cudnn)s'
+istaged = %(istaged)s
+commit = '%(commit)s'
+with_mkl = '%(with_mkl)s'
+'''
+    commit = git_commit()
+
+    dirname = os.path.dirname(filename)
+
+    try:
+        os.makedirs(dirname)
+    except OSError as e:
+        if e.errno != errno.EEXIST:
+            raise
+
+    with open(filename, 'w') as f:
+        f.write(cnt % {
+            'major': get_major(),
+            'minor': get_minor(),
+            'patch': get_patch(),
+            'version': '${CINN_VERSION}',
+            'cuda': get_cuda_version(),
+            'cudnn': get_cudnn_version(),
+            'commit': commit,
+            'istaged': is_tagged(),
+            'with_mkl': '${WITH_MKL}'})
+
+write_version_py(filename='${CINN_BINARY_DIR}/python/cinn/version/info.py')
+
+if sys.platform != 'win32':
+    @contextmanager
+    def redirect_stdout():
+        f_log = open('${SETUP_LOG_FILE}', 'w')
+        origin_stdout = sys.stdout
+        sys.stdout = f_log
+        try:
+            yield
+        finally:
+            sys.stdout = origin_stdout
+            f_log.close()
+else:
+    @contextmanager
+    def redirect_stdout():
+        yield
+
+libs_path = '${CMAKE_BINARY_DIR}/python/cinn/libs'
+
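+# The loop below copies each dependency into `libs_path` and, for shared
+# objects, rewrites its rpath so the wheel stays relocatable. A hedged sketch
+# of the equivalent manual step (the library name is purely illustrative):
+#
+#   patchelf --set-rpath '$ORIGIN/' python/cinn/libs/libmklml_intel.so
+#
+# '$ORIGIN/' tells the dynamic linker to search next to the loading object.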
+cinnlibs = []
+package_data = {'cinn': ['core_api.so'], 'cinn.libs': []}
+
+if '${WITH_MKL}' == 'ON':
+    cinnlibs.append('${MKLML_LIB}')
+    cinnlibs.append('${MKLML_IOMP_LIB}')
+
+if '${WITH_GPU}' == 'ON':
+    cinnlibs.append('${CMAKE_BINARY_DIR}/dist/cinn/include/paddle/cinn/runtime/cuda/cinn_cuda_runtime_source.cuh')
+    cinnlibs.append('${CMAKE_BINARY_DIR}/dist/cinn/include/paddle/cinn/runtime/cuda/float16.h')
+    cinnlibs.append('${CMAKE_BINARY_DIR}/dist/cinn/include/paddle/cinn/runtime/cuda/bfloat16.h')
+
+for lib in cinnlibs:
+    shutil.copy(lib, libs_path)
+    libname = os.path.basename(lib)
+    if lib.endswith('.so'):
+        set_rpath(os.path.join(libs_path, libname), '$ORIGIN/')
+    package_data['cinn.libs'].append(libname)
+
+set_rpath('${CMAKE_BINARY_DIR}/python/cinn/core_api.so', '$ORIGIN/libs/')
+
+packages = ["cinn",
+            "cinn.auto_schedule",
+            "cinn.auto_schedule.cost_model",
+            "cinn.ir",
+            "cinn.libs",
+            "cinn.version"
+]
+
+with redirect_stdout():
+    setup(
+        name='${PACKAGE_NAME}',
+        version='${CINN_VERSION}',
+        description='CINN: a Compiler Infrastructure for Neural Networks',
+        maintainer="PaddlePaddle",
+        maintainer_email="Paddle-better@baidu.com",
+        url='https://github.com/PaddlePaddle/Paddle',
+        license='Apache Software License',
+        packages=packages,
+        package_data=package_data
+    )

From 796f1d5eb20ba0d3e3ac6977e1683a3403228aa7 Mon Sep 17 00:00:00 2001
From: 6clc
Date: Thu, 15 Jun 2023 14:58:13 +0800
Subject: [PATCH 02/14] feat(cmake): add cmake of cinn python test

---
 test/cinn/CMakeLists.txt | 269 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 269 insertions(+)
 create mode 100644 test/cinn/CMakeLists.txt

diff --git a/test/cinn/CMakeLists.txt b/test/cinn/CMakeLists.txt
new file mode 100644
index 0000000000000..3497958bf3d73
--- /dev/null
+++ b/test/cinn/CMakeLists.txt
@@ -0,0 +1,269 @@
+set(CINN_PYTHON_TEST_DIR ${PROJECT_SOURCE_DIR}/test/cinn)
+set(CINN_CORE_API ${CMAKE_BINARY_DIR}/python/cinn/core_api.so)
+
+add_custom_command(
+  OUTPUT ${CINN_CORE_API}
+  COMMAND cp --remove-destination
+          ${CMAKE_BINARY_DIR}/paddle/cinn/pybind/core_api.so ${CINN_CORE_API}
+  DEPENDS core_api ${CINN_PY_FILES})
+
+
+set(BASIC_TEST_NAMES
+    test_matmul
+    test_common
+    test_packed_func
+    test_pe_elementwise
+    test_pe_reduction
+    test_pe_transform
+    test_op_broadcast
+    # test_op_transform
+)
+
+foreach(basic_test_name ${BASIC_TEST_NAMES})
+  add_test(
+    NAME ${basic_test_name}
+    COMMAND
+      ${CMAKE_COMMAND} -E env
+      PYTHONPATH=${PROJECT_SOURCE_DIR}:$ENV{PYTHONPATH} python3
+      ${CMAKE_CURRENT_SOURCE_DIR}/${basic_test_name}.py
+    WORKING_DIRECTORY ${CMAKE_BINARY_DIR})
+endforeach()
+
+if(NOT ${WITH_GPU})
+  # ADD_TEST(NAME test_op_nn
+  #   COMMAND ${CMAKE_COMMAND} -E env PYTHONPATH=${PROJECT_SOURCE_DIR}:$ENV{PYTHONPATH}
+  #     python3 ${CMAKE_CURRENT_SOURCE_DIR}/test_op_nn.py WORKING_DIRECTORY ${CMAKE_BINARY_DIR}
+  # )
+endif()
+
+if(WITH_GPU)
+  # TODO(thisjiang): revert test_cinn_frontend after fix inference mul problem
+  # ADD_TEST(NAME test_cinn_frontend
+  #   COMMAND ${CMAKE_COMMAND} -E env PYTHONPATH=${PROJECT_SOURCE_DIR}:$ENV{PYTHONPATH}
+  #     python3 ${CMAKE_CURRENT_SOURCE_DIR}/test_frontend.py
+  #     ${CMAKE_BINARY_DIR}/thirds/naive_mul_model
+  #     ${CMAKE_BINARY_DIR}/thirds/multi_fc_model
+  #     "${WITH_GPU}" WORKING_DIRECTORY
${CMAKE_BINARY_DIR} + # ) + add_test( + NAME test_netbuilder + COMMAND + ${CMAKE_COMMAND} -E env + PYTHONPATH=${PROJECT_SOURCE_DIR}:$ENV{PYTHONPATH} python3 + ${CMAKE_CURRENT_SOURCE_DIR}/test_netbuilder.py "${WITH_GPU}" + WORKING_DIRECTORY ${CMAKE_BINARY_DIR}) +endif() + +#ADD_TEST(NAME test_computation_python +# COMMAND ${CMAKE_COMMAND} -E env PYTHONPATH=${PROJECT_SOURCE_DIR}:$ENV{PYTHONPATH} +# python3 ${CMAKE_CURRENT_SOURCE_DIR}/test_computation.py +# ${CMAKE_BINARY_DIR}/thirds/naive_mul_model +# "${WITH_GPU}" WORKING_DIRECTORY ${CMAKE_BINARY_DIR} +#) + +#ADD_TEST(NAME test_cinn_ops_check +# COMMAND ${CMAKE_COMMAND} -E env PYTHONPATH=${PROJECT_SOURCE_DIR}:$ENV{PYTHONPATH} +# python3 ${CMAKE_CURRENT_SOURCE_DIR}/test_ops.py "${WITH_GPU}" +# WORKING_DIRECTORY ${CMAKE_BINARY_DIR} +#) + +add_test( + NAME test_cinn_op_benchmark + COMMAND + ${CMAKE_COMMAND} -E env + PYTHONPATH=${PROJECT_SOURCE_DIR}:$ENV{PYTHONPATH} python3 + ${CMAKE_CURRENT_SOURCE_DIR}/test_op_benchmark.py "${WITH_GPU}" + WORKING_DIRECTORY ${CMAKE_BINARY_DIR}) + +if(WITH_GPU) + add_test( + NAME test_cinn_fake_resnet + COMMAND + ${CMAKE_COMMAND} -E env + PYTHONPATH=${PROJECT_SOURCE_DIR}:$ENV{PYTHONPATH} python3 + ${CMAKE_CURRENT_SOURCE_DIR}/test_resnet.py + "${CMAKE_BINARY_DIR}/thirds/resnet_model" "${WITH_GPU}" + WORKING_DIRECTORY ${CMAKE_BINARY_DIR}) + + add_test( + NAME test_cinn_real_resnet18 + COMMAND + ${CMAKE_COMMAND} -E env + PYTHONPATH=${PROJECT_SOURCE_DIR}:$ENV{PYTHONPATH} python3 + ${CMAKE_CURRENT_SOURCE_DIR}/test_resnet18.py + "${CMAKE_BINARY_DIR}/thirds/ResNet18" "${WITH_GPU}" + WORKING_DIRECTORY ${CMAKE_BINARY_DIR}) + + add_test( + NAME test_cinn_real_mobilenetV2 + COMMAND + ${CMAKE_COMMAND} -E env + PYTHONPATH=${PROJECT_SOURCE_DIR}:$ENV{PYTHONPATH} python3 + ${CMAKE_CURRENT_SOURCE_DIR}/test_mobilenetv2.py + "${CMAKE_BINARY_DIR}/thirds/MobileNetV2" "${WITH_GPU}" + WORKING_DIRECTORY ${CMAKE_BINARY_DIR}) + + add_test( + NAME test_cinn_real_efficientnet + COMMAND + ${CMAKE_COMMAND} -E env + PYTHONPATH=${PROJECT_SOURCE_DIR}:$ENV{PYTHONPATH} python3 + ${CMAKE_CURRENT_SOURCE_DIR}/test_efficientnet.py + "${CMAKE_BINARY_DIR}/thirds/EfficientNet" "${WITH_GPU}" + WORKING_DIRECTORY ${CMAKE_BINARY_DIR}) + + add_test( + NAME test_cinn_real_mobilenetV1 + COMMAND + ${CMAKE_COMMAND} -E env + PYTHONPATH=${PROJECT_SOURCE_DIR}:$ENV{PYTHONPATH} python3 + ${CMAKE_CURRENT_SOURCE_DIR}/test_mobilenetv1.py + "${CMAKE_BINARY_DIR}/thirds/MobilenetV1" "${WITH_GPU}" + WORKING_DIRECTORY ${CMAKE_BINARY_DIR}) + + add_test( + NAME test_cinn_real_resnet50 + COMMAND + ${CMAKE_COMMAND} -E env + PYTHONPATH=${PROJECT_SOURCE_DIR}:$ENV{PYTHONPATH} python3 + ${CMAKE_CURRENT_SOURCE_DIR}/test_resnet50.py + "${CMAKE_BINARY_DIR}/thirds/ResNet50" "${WITH_GPU}" + WORKING_DIRECTORY ${CMAKE_BINARY_DIR}) + + add_test( + NAME test_cinn_real_squeezenet + COMMAND + ${CMAKE_COMMAND} -E env + PYTHONPATH=${PROJECT_SOURCE_DIR}:$ENV{PYTHONPATH} python3 + ${CMAKE_CURRENT_SOURCE_DIR}/test_squeezenet.py + "${CMAKE_BINARY_DIR}/thirds/SqueezeNet" "${WITH_GPU}" + WORKING_DIRECTORY ${CMAKE_BINARY_DIR}) + + add_test( + NAME test_paddle_model_convertor + COMMAND + ${CMAKE_COMMAND} -E env + PYTHONPATH=${PROJECT_SOURCE_DIR}:$ENV{PYTHONPATH} python3 + ${CMAKE_CURRENT_SOURCE_DIR}/test_paddle_model_convertor.py --path + "${CMAKE_BINARY_DIR}/thirds/resnet_model" + WORKING_DIRECTORY ${CMAKE_BINARY_DIR}) +endif() + +#ADD_TEST(NAME test_cinn_real_facedet +# COMMAND ${CMAKE_COMMAND} -E env PYTHONPATH=${PROJECT_SOURCE_DIR}:$ENV{PYTHONPATH} +# python3 
${CMAKE_CURRENT_SOURCE_DIR}/test_facedet.py "${CMAKE_BINARY_DIR}/thirds/FaceDet" "${WITH_GPU}" +# WORKING_DIRECTORY ${CMAKE_BINARY_DIR}) +if(WITH_GPU) + file( + GLOB CINN_OP_TEST + RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" + "ops/test_*.py") + set(EXCLUDE_OP test_conv2d_op) + + if(WITH_GPU) + add_test( + NAME test_conv2d_op + COMMAND + ${CMAKE_COMMAND} -E env + PYTHONPATH=${PROJECT_SOURCE_DIR}:$ENV{PYTHONPATH} python3 + ${CMAKE_CURRENT_SOURCE_DIR}/ops/test_conv2d_op.py + WORKING_DIRECTORY ${CMAKE_BINARY_DIR}) + endif() + + foreach(op_test_name ${EXCLUDE_OP}) + list(REMOVE_ITEM CINN_OP_TEST ops/${op_test_name}.py) + endforeach() + + foreach(op_test_name ${CINN_OP_TEST}) + string(REGEX REPLACE ".py" "" op_test_name ${op_test_name}) + add_test( + NAME ${op_test_name} + COMMAND + ${CMAKE_COMMAND} -E env + PYTHONPATH=${PROJECT_SOURCE_DIR}:$ENV{PYTHONPATH} python3 + ${CMAKE_CURRENT_SOURCE_DIR}/${op_test_name}.py + WORKING_DIRECTORY ${CMAKE_BINARY_DIR}) + endforeach() + + # test op mapper + file( + GLOB CINN_OP_MAPPER_TEST + RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" + "op_mappers/test_*.py") + set(EXCLUDE_OP_MAPPER test_mul_op test_conv2d_op) + + if(WITH_GPU) + add_test( + NAME test_mul_op_mapper + COMMAND + ${CMAKE_COMMAND} -E env + PYTHONPATH=${PROJECT_SOURCE_DIR}:$ENV{PYTHONPATH} python3 + ${CMAKE_CURRENT_SOURCE_DIR}/op_mappers/test_mul_op.py + WORKING_DIRECTORY ${CMAKE_BINARY_DIR}) + + add_test( + NAME test_conv2d_op_mapper + COMMAND + ${CMAKE_COMMAND} -E env + PYTHONPATH=${PROJECT_SOURCE_DIR}:$ENV{PYTHONPATH} python3 + ${CMAKE_CURRENT_SOURCE_DIR}/op_mappers/test_conv2d_op.py + WORKING_DIRECTORY ${CMAKE_BINARY_DIR}) + endif() + + foreach(op_mapper_test_name ${EXCLUDE_OP_MAPPER}) + list(REMOVE_ITEM CINN_OP_MAPPER_TEST op_mappers/${op_mapper_test_name}.py) + endforeach() + + foreach(op_mapper_test_name ${CINN_OP_MAPPER_TEST}) + string(REGEX REPLACE ".py" "" op_mapper_test_name ${op_mapper_test_name}) + add_test( + NAME "${op_mapper_test_name}_mapper" + COMMAND + ${CMAKE_COMMAND} -E env + PYTHONPATH=${PROJECT_SOURCE_DIR}:$ENV{PYTHONPATH} python3 + ${CMAKE_CURRENT_SOURCE_DIR}/${op_mapper_test_name}.py + WORKING_DIRECTORY ${CMAKE_BINARY_DIR}) + endforeach() + + # test pass test + file( + GLOB CINN_PASS_TEST + RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" + "passes/test_*.py") + + foreach(pass_test_name ${EXCLUDE_PASS}) + list(REMOVE_ITEM CINN_PASS_TEST passes/${pass_test_name}.py) + endforeach() + + foreach(pass_test_name ${CINN_PASS_TEST}) + string(REGEX REPLACE ".py" "" pass_test_name ${pass_test_name}) + add_test( + NAME ${pass_test_name} + COMMAND + ${CMAKE_COMMAND} -E env + PYTHONPATH=${PROJECT_SOURCE_DIR}:${CMAKE_BINARY_DIR}/python:$ENV{PYTHONPATH} python3 + ${CMAKE_CURRENT_SOURCE_DIR}/${pass_test_name}.py + WORKING_DIRECTORY ${CMAKE_BINARY_DIR}) + endforeach() + + file( + GLOB CINN_FUSION_TEST + RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" + "fusion/test_*.py") + + foreach(fusion_test_name ${EXCLUDE_FUSION}) + list(REMOVE_ITEM CINN_FUSION_TEST fusion/${fusion_test_name}.py) + endforeach() + + foreach(fusion_test_name ${CINN_FUSION_TEST}) + string(REGEX REPLACE ".py" "" fusion_test_name ${fusion_test_name}) + add_test( + NAME ${fusion_test_name} + COMMAND + ${CMAKE_COMMAND} -E env + PYTHONPATH=${PROJECT_SOURCE_DIR}:$ENV{PYTHONPATH} python3 + ${CMAKE_CURRENT_SOURCE_DIR}/${fusion_test_name}.py + WORKING_DIRECTORY ${CMAKE_BINARY_DIR}) + endforeach() + +endif() From 796f1d5eb20ba0d3e3ac6977e1683a3403228aa7 Mon Sep 17 00:00:00 2001 From: 6clc Date: Thu, 15 Jun 2023 16:34:18 +0800 Subject: [PATCH 03/14] feat(cmake): 
add jit --- cmake/cinn/external/jitify.cmake | 1 - paddle/cinn/CMakeLists.txt | 21 + paddle/cinn/auto_schedule/CMakeLists.txt | 22 + .../auto_schedule/analysis/CMakeLists.txt | 5 + .../cinn/auto_schedule/analysis/analyze_ir.cc | 176 + .../cinn/auto_schedule/analysis/analyze_ir.h | 48 + .../auto_schedule/analysis/analyze_ir_test.cc | 181 + paddle/cinn/auto_schedule/auto_schedule.proto | 26 + paddle/cinn/auto_schedule/auto_tuner.cc | 163 + paddle/cinn/auto_schedule/auto_tuner.h | 79 + paddle/cinn/auto_schedule/auto_tuner_test.cc | 164 + .../auto_schedule/cost_model/CMakeLists.txt | 7 + .../cost_model/expr_cost_model.cc | 77 + .../cost_model/expr_cost_model.h | 45 + .../cinn/auto_schedule/cost_model/feature.cc | 175 + .../cinn/auto_schedule/cost_model/feature.h | 178 + .../cost_model/feature_extractor.cc | 299 ++ .../cost_model/feature_extractor.h | 60 + .../cost_model/feature_extractor_test.cc | 158 + .../auto_schedule/cost_model/feature_test.cc | 28 + .../cost_model/xgb_cost_model.cc | 135 + .../auto_schedule/cost_model/xgb_cost_model.h | 75 + .../cost_model/xgb_cost_model_test.cc | 69 + .../auto_schedule/database/CMakeLists.txt | 6 + .../cinn/auto_schedule/database/database.cc | 122 + paddle/cinn/auto_schedule/database/database.h | 102 + .../auto_schedule/database/database_test.cc | 70 + .../database/jsonfile_database.cc | 99 + .../database/jsonfile_database.h | 52 + .../database/jsonfile_database_test.cc | 214 ++ .../cinn/auto_schedule/measure/CMakeLists.txt | 6 + paddle/cinn/auto_schedule/measure/measure.h | 79 + .../auto_schedule/measure/measurer_test.cc | 127 + .../measure/schedule_measurer.cc | 77 + .../auto_schedule/measure/schedule_measurer.h | 44 + .../auto_schedule/measure/simple_builder.cc | 41 + .../auto_schedule/measure/simple_builder.h | 37 + .../auto_schedule/measure/simple_runner.cc | 227 ++ .../auto_schedule/measure/simple_runner.h | 43 + .../measure/simple_runner_test.cc | 139 + .../post_schedule_rule/CMakeLists.txt | 9 + .../post_schedule_rule/cooperative_process.cc | 70 + .../post_schedule_rule/cooperative_process.h | 34 + .../cooperative_process_test.cc | 199 ++ .../post_schedule_rule/post_schedule_rule.h | 38 + .../auto_schedule/search_space/CMakeLists.txt | 15 + .../search_space/auto_gen_rule/CMakeLists.txt | 24 + .../search_space/auto_gen_rule/auto_bind.cc | 163 + .../search_space/auto_gen_rule/auto_bind.h | 48 + .../auto_gen_rule/auto_bind_test.cc | 118 + .../auto_gen_rule/auto_gen_rule.cc | 41 + .../auto_gen_rule/auto_gen_rule.h | 84 + .../search_space/auto_gen_rule/auto_inline.cc | 214 ++ .../search_space/auto_gen_rule/auto_inline.h | 71 + .../auto_gen_rule/auto_inline_test.cc | 493 +++ .../search_space/auto_gen_rule/auto_unroll.cc | 120 + .../search_space/auto_gen_rule/auto_unroll.h | 54 + .../auto_gen_rule/auto_unroll_test.cc | 107 + .../auto_gen_rule/mix_rules_test.cc | 66 + .../auto_gen_rule/multi_level_tiling.cc | 401 +++ .../auto_gen_rule/multi_level_tiling.h | 138 + .../auto_gen_rule/multi_level_tiling_test.cc | 548 +++ .../search_space/auto_gen_rule/skip_rule.cc | 38 + .../search_space/auto_gen_rule/skip_rule.h | 45 + .../auto_gen_rule/skip_rule_test.cc | 122 + .../search_space/auto_gen_rule/test_helper.cc | 240 ++ .../search_space/auto_gen_rule/test_helper.h | 92 + .../search_space/block_sampler.cc | 92 + .../search_space/block_sampler.h | 115 + .../search_space/block_sampler_test.cc | 73 + .../search_space/rule_sampler.cc | 80 + .../auto_schedule/search_space/rule_sampler.h | 114 + .../search_space/rule_sampler_test.cc | 75 + 
.../search_space/search_space.cc | 301 ++ .../auto_schedule/search_space/search_space.h | 104 + .../search_space/search_space_test.cc | 21 + .../search_space/search_state.cc | 152 + .../auto_schedule/search_space/search_state.h | 87 + .../search_space/search_state_test.cc | 136 + .../search_strategy/CMakeLists.txt | 7 + .../search_strategy/evolutionary_search.cc | 302 ++ .../search_strategy/evolutionary_search.h | 146 + .../evolutionary_search_test.cc | 196 ++ .../mutate_rule/CMakeLists.txt | 8 + .../mutate_rule/mutate_rule.cc | 32 + .../search_strategy/mutate_rule/mutate_rule.h | 48 + .../mutate_rule/mutate_tile_size.cc | 142 + .../mutate_rule/mutate_tile_size.h | 33 + .../mutate_rule/mutate_tile_size_test.cc | 126 + paddle/cinn/auto_schedule/task/CMakeLists.txt | 12 + .../cinn/auto_schedule/task/task_creator.cc | 57 + paddle/cinn/auto_schedule/task/task_creator.h | 36 + .../auto_schedule/task/task_creator_test.cc | 72 + .../cinn/auto_schedule/task/task_optimizer.cc | 407 +++ .../cinn/auto_schedule/task/task_optimizer.h | 70 + .../cinn/auto_schedule/task/task_registry.h | 79 + .../auto_schedule/task/task_registry_test.cc | 105 + paddle/cinn/auto_schedule/task/tune_task.cc | 97 + paddle/cinn/auto_schedule/task/tune_task.h | 69 + .../cinn/auto_schedule/task/tune_task_test.cc | 339 ++ .../task_scheduler/CMakeLists.txt | 5 + .../task_scheduler/efficiency_priority.cc | 33 + .../task_scheduler/efficiency_priority.h | 39 + .../task_scheduler/round_robin.cc | 28 + .../task_scheduler/round_robin.h | 36 + .../task_scheduler/task_scheduler.cc | 46 + .../task_scheduler/task_scheduler.h | 67 + .../task_scheduler/task_scheduler_test.cc | 56 + .../cinn/auto_schedule/tests/CMakeLists.txt | 5 + .../tests/performance_comparison_test.cc | 310 ++ paddle/cinn/auto_schedule/tuning.h | 91 + paddle/cinn/backends/CMakeLists.txt | 67 + paddle/cinn/backends/_x86_builtin_source.cc | 378 +++ paddle/cinn/backends/codegen_c.cc | 868 +++++ paddle/cinn/backends/codegen_c.h | 127 + paddle/cinn/backends/codegen_c_test.cc | 436 +++ paddle/cinn/backends/codegen_c_x86.cc | 153 + paddle/cinn/backends/codegen_c_x86.h | 131 + paddle/cinn/backends/codegen_c_x86_test.cc | 77 + paddle/cinn/backends/codegen_cuda_dev.cc | 391 +++ paddle/cinn/backends/codegen_cuda_dev.h | 110 + .../backends/codegen_cuda_generate_test.cc | 68 + paddle/cinn/backends/codegen_cuda_host.cc | 173 + paddle/cinn/backends/codegen_cuda_host.h | 56 + paddle/cinn/backends/codegen_cuda_util.cc | 30 + paddle/cinn/backends/codegen_cuda_util.h | 140 + paddle/cinn/backends/codegen_debug_test.cc | 121 + paddle/cinn/backends/compiler.cc | 163 + paddle/cinn/backends/compiler.h | 94 + paddle/cinn/backends/compiler_test.cc | 196 ++ paddle/cinn/backends/cuda_util.cc | 56 + paddle/cinn/backends/cuda_util.h | 100 + paddle/cinn/backends/extern_func_emitter.cc | 81 + paddle/cinn/backends/extern_func_emitter.h | 134 + .../backends/extern_func_emitter_builtin.cc | 87 + .../backends/extern_func_emitter_builtin.h | 61 + .../cinn/backends/extern_func_jit_register.cc | 40 + .../cinn/backends/extern_func_jit_register.h | 161 + paddle/cinn/backends/extern_func_protos.cc | 66 + paddle/cinn/backends/extern_func_protos.h | 43 + paddle/cinn/backends/function_prototype.cc | 130 + paddle/cinn/backends/function_prototype.h | 130 + paddle/cinn/backends/generated1.cu | 15 + paddle/cinn/backends/generated_module1.cc | 15 + paddle/cinn/backends/ir_schedule_test.cc | 3019 +++++++++++++++++ paddle/cinn/backends/llvm/CMakeLists.txt | 41 + paddle/cinn/backends/llvm/codegen_llvm.cc | 1527 +++++++++ 
paddle/cinn/backends/llvm/codegen_llvm.h | 248 ++ .../cinn/backends/llvm/codegen_llvm_test.cc | 623 ++++ paddle/cinn/backends/llvm/codegen_x86.cc | 163 + paddle/cinn/backends/llvm/codegen_x86.h | 59 + paddle/cinn/backends/llvm/codegen_x86_test.cc | 73 + paddle/cinn/backends/llvm/execution_engine.cc | 250 ++ paddle/cinn/backends/llvm/execution_engine.h | 104 + .../backends/llvm/execution_engine_test.cc | 329 ++ .../backends/llvm/generate_runtime_llvm_ir.py | 57 + paddle/cinn/backends/llvm/ir_builder_mixin.h | 306 ++ paddle/cinn/backends/llvm/llvm_intrin_rule.h | 177 + paddle/cinn/backends/llvm/llvm_optimizer.cc | 166 + paddle/cinn/backends/llvm/llvm_optimizer.h | 43 + paddle/cinn/backends/llvm/llvm_util.cc | 146 + paddle/cinn/backends/llvm/llvm_util.h | 55 + .../backends/llvm/runtime_symbol_registry.cc | 68 + .../backends/llvm/runtime_symbol_registry.h | 113 + paddle/cinn/backends/llvm/simple_jit.cc | 133 + paddle/cinn/backends/llvm/simple_jit.h | 82 + paddle/cinn/backends/modular.cc | 128 + paddle/cinn/backends/modular.h | 40 + paddle/cinn/backends/nvrtc/CMakeLists.txt | 8 + .../cinn/backends/nvrtc/header_generator.cc | 44 + paddle/cinn/backends/nvrtc/header_generator.h | 47 + paddle/cinn/backends/nvrtc/nvrtc_util.cc | 239 ++ paddle/cinn/backends/nvrtc/nvrtc_util.h | 92 + paddle/cinn/backends/nvrtc/nvrtc_util_test.cc | 90 + paddle/cinn/backends/outputs.cc | 50 + paddle/cinn/backends/outputs.h | 52 + paddle/cinn/backends/raw_cuda_code_test.cu | 54 + paddle/cinn/cinn.h | 56 + paddle/cinn/common/CMakeLists.txt | 36 + paddle/cinn/common/arithmatic.cc | 310 ++ paddle/cinn/common/arithmatic.h | 85 + paddle/cinn/common/arithmatic_test.cc | 92 + paddle/cinn/common/axis.cc | 115 + paddle/cinn/common/axis.h | 45 + paddle/cinn/common/axis_test.cc | 45 + paddle/cinn/common/bfloat16.h | 402 +++ paddle/cinn/common/cas.cc | 2200 ++++++++++++ paddle/cinn/common/cas.h | 166 + paddle/cinn/common/cas_test.cc | 432 +++ paddle/cinn/common/cinn_value.cc | 251 ++ paddle/cinn/common/cinn_value.h | 222 ++ paddle/cinn/common/cinn_value_test.cc | 59 + paddle/cinn/common/common.h | 69 + paddle/cinn/common/context.cc | 80 + paddle/cinn/common/context.h | 95 + paddle/cinn/common/cost_model.h | 40 + paddle/cinn/common/cuda_test_helper.cc | 96 + paddle/cinn/common/cuda_test_helper.h | 56 + paddle/cinn/common/debug_manager.cc | 68 + paddle/cinn/common/debug_manager.h | 50 + paddle/cinn/common/float16.h | 629 ++++ .../cinn/common/float16_bfloat16_cuda_test.cu | 236 ++ .../cinn/common/float16_bfloat16_host_test.cc | 98 + paddle/cinn/common/float16_bfloat16_utils.h | 183 + paddle/cinn/common/graph_utils.cc | 212 ++ paddle/cinn/common/graph_utils.h | 289 ++ paddle/cinn/common/graph_utils_test.cc | 92 + paddle/cinn/common/info_registry.cc | 19 + paddle/cinn/common/info_registry.h | 50 + paddle/cinn/common/ir_util.cc | 417 +++ paddle/cinn/common/ir_util.h | 146 + paddle/cinn/common/macros.h | 51 + paddle/cinn/common/object.cc | 19 + paddle/cinn/common/object.h | 78 + .../cinn/common/python_interpreter_guard.cc | 32 + paddle/cinn/common/python_interpreter_guard.h | 43 + paddle/cinn/common/shared.cc | 15 + paddle/cinn/common/shared.h | 151 + paddle/cinn/common/shared_test.cc | 55 + paddle/cinn/common/target.cc | 225 ++ paddle/cinn/common/target.h | 115 + paddle/cinn/common/test_helper.cc | 80 + paddle/cinn/common/test_helper.h | 115 + paddle/cinn/common/type.cc | 570 ++++ paddle/cinn/common/type.h | 316 ++ paddle/cinn/common/type_test.cc | 31 + paddle/cinn/common/union_find.cc | 24 + paddle/cinn/common/union_find.h | 100 + 
paddle/cinn/frontend/CMakeLists.txt | 49 + paddle/cinn/frontend/computation.cc | 243 ++ paddle/cinn/frontend/computation.h | 161 + paddle/cinn/frontend/computation_test.cc | 300 ++ .../cinn/frontend/decomposer/CMakeLists.txt | 19 + paddle/cinn/frontend/decomposer/activation.cc | 146 + .../frontend/decomposer/activation_test.cc | 99 + paddle/cinn/frontend/decomposer/batch_norm.cc | 302 ++ .../frontend/decomposer/batch_norm_test.cc | 420 +++ paddle/cinn/frontend/decomposer/broadcast.cc | 175 + .../frontend/decomposer/broadcast_test.cc | 281 ++ .../cinn/frontend/decomposer/elementwise.cc | 46 + .../frontend/decomposer/elementwise_test.cc | 45 + .../cinn/frontend/decomposer/test_helper.cc | 88 + paddle/cinn/frontend/decomposer/test_helper.h | 242 ++ paddle/cinn/frontend/decomposer/top_k.cc | 54 + paddle/cinn/frontend/decomposer/top_k_test.cc | 55 + .../cinn/frontend/decomposer/use_decomposer.h | 29 + paddle/cinn/frontend/decomposer_registry.h | 128 + .../cinn/frontend/decomposer_registry_test.cc | 29 + paddle/cinn/frontend/interpreter.cc | 142 + paddle/cinn/frontend/interpreter.h | 66 + paddle/cinn/frontend/interpreter_test.cc | 34 + paddle/cinn/frontend/net_builder.cc | 939 +++++ paddle/cinn/frontend/net_builder.h | 1146 +++++++ paddle/cinn/frontend/net_builder_test.cc | 1501 ++++++++ paddle/cinn/frontend/op_mapper_registry.cc | 89 + paddle/cinn/frontend/op_mapper_registry.h | 151 + .../cinn/frontend/op_mapper_registry_test.cc | 42 + .../cinn/frontend/op_mappers/CMakeLists.txt | 4 + .../cinn/frontend/op_mappers/common_utils.h | 171 + .../frontend/op_mappers/paddle/CMakeLists.txt | 3 + .../frontend/op_mappers/paddle/arg_min_max.cc | 92 + .../frontend/op_mappers/paddle/argsort.cc | 59 + .../cinn/frontend/op_mappers/paddle/atan.cc | 54 + .../frontend/op_mappers/paddle/batchnorm.cc | 168 + .../cinn/frontend/op_mappers/paddle/binary.cc | 64 + .../frontend/op_mappers/paddle/cholesky.cc | 45 + .../cinn/frontend/op_mappers/paddle/clip.cc | 81 + .../frontend/op_mappers/paddle/compare.cc | 79 + .../cinn/frontend/op_mappers/paddle/concat.cc | 173 + .../frontend/op_mappers/paddle/constant.cc | 238 ++ .../cinn/frontend/op_mappers/paddle/conv2d.cc | 178 + .../cinn/frontend/op_mappers/paddle/cumsum.cc | 96 + .../frontend/op_mappers/paddle/dropout.cc | 45 + .../frontend/op_mappers/paddle/elementwise.cc | 257 ++ .../cinn/frontend/op_mappers/paddle/expand.cc | 120 + .../frontend/op_mappers/paddle/fetch_feed.cc | 61 + .../cinn/frontend/op_mappers/paddle/flip.cc | 51 + .../cinn/frontend/op_mappers/paddle/gather.cc | 68 + .../frontend/op_mappers/paddle/gather_nd.cc | 49 + .../op_mappers/paddle/gaussian_random.cc | 52 + .../frontend/op_mappers/paddle/layer_norm.cc | 160 + paddle/cinn/frontend/op_mappers/paddle/log.cc | 88 + .../op_mappers/paddle/lookup_table.cc | 65 + .../cinn/frontend/op_mappers/paddle/matmul.cc | 57 + paddle/cinn/frontend/op_mappers/paddle/mul.cc | 51 + .../cinn/frontend/op_mappers/paddle/norm.cc | 110 + .../frontend/op_mappers/paddle/one_hot.cc | 75 + .../cinn/frontend/op_mappers/paddle/pool2d.cc | 115 + .../frontend/op_mappers/paddle/randint.cc | 56 + .../cinn/frontend/op_mappers/paddle/reduce.cc | 119 + .../cinn/frontend/op_mappers/paddle/relu.cc | 73 + .../frontend/op_mappers/paddle/reshape.cc | 135 + .../frontend/op_mappers/paddle/reverse.cc | 45 + .../cinn/frontend/op_mappers/paddle/roll.cc | 106 + .../cinn/frontend/op_mappers/paddle/scale.cc | 80 + .../frontend/op_mappers/paddle/scatter.cc | 75 + .../cinn/frontend/op_mappers/paddle/slice.cc | 52 + .../frontend/op_mappers/paddle/softmax.cc 
| 44 + .../frontend/op_mappers/paddle/squeeze.cc | 63 + .../op_mappers/paddle/strided_slice.cc | 55 + .../op_mappers/paddle/take_along_axis.cc | 52 + .../cinn/frontend/op_mappers/paddle/tile.cc | 98 + .../cinn/frontend/op_mappers/paddle/top_k.cc | 48 + .../frontend/op_mappers/paddle/transpose.cc | 82 + .../op_mappers/paddle/triangular_solve.cc | 52 + .../cinn/frontend/op_mappers/paddle/unary.cc | 109 + .../op_mappers/paddle/uniform_random.cc | 56 + .../frontend/op_mappers/paddle/unsqueeze.cc | 63 + .../cinn/frontend/op_mappers/paddle/where.cc | 50 + .../op_mappers/science/CMakeLists.txt | 3 + .../frontend/op_mappers/science/broadcast.cc | 77 + .../frontend/op_mappers/science/compare.cc | 64 + .../cinn/frontend/op_mappers/science/math.cc | 107 + .../frontend/op_mappers/science/transform.cc | 404 +++ .../cinn/frontend/op_mappers/use_op_mappers.h | 72 + paddle/cinn/frontend/optimize.cc | 177 + paddle/cinn/frontend/optimize.h | 49 + paddle/cinn/frontend/paddle/CMakeLists.txt | 26 + paddle/cinn/frontend/paddle/README.md | 1 + paddle/cinn/frontend/paddle/compatible_pb.cc | 266 ++ paddle/cinn/frontend/paddle/compatible_pb.h | 56 + .../cinn/frontend/paddle/cpp/CMakeLists.txt | 18 + paddle/cinn/frontend/paddle/cpp/block_desc.cc | 55 + paddle/cinn/frontend/paddle/cpp/block_desc.h | 79 + paddle/cinn/frontend/paddle/cpp/desc_api.h | 250 ++ paddle/cinn/frontend/paddle/cpp/op_desc.cc | 152 + paddle/cinn/frontend/paddle/cpp/op_desc.h | 110 + .../cinn/frontend/paddle/cpp/program_desc.cc | 37 + .../cinn/frontend/paddle/cpp/program_desc.h | 59 + paddle/cinn/frontend/paddle/cpp/var_desc.cc | 17 + paddle/cinn/frontend/paddle/cpp/var_desc.h | 58 + paddle/cinn/frontend/paddle/framework.proto | 214 ++ paddle/cinn/frontend/paddle/model_parser.cc | 272 ++ paddle/cinn/frontend/paddle/model_parser.h | 66 + .../cinn/frontend/paddle/model_parser_test.cc | 45 + paddle/cinn/frontend/paddle/pb/CMakeLists.txt | 16 + paddle/cinn/frontend/paddle/pb/block_desc.cc | 41 + paddle/cinn/frontend/paddle/pb/block_desc.h | 71 + paddle/cinn/frontend/paddle/pb/op_desc.cc | 124 + paddle/cinn/frontend/paddle/pb/op_desc.h | 169 + .../cinn/frontend/paddle/pb/program_desc.cc | 33 + paddle/cinn/frontend/paddle/pb/program_desc.h | 57 + paddle/cinn/frontend/paddle/pb/var_desc.cc | 341 ++ paddle/cinn/frontend/paddle/pb/var_desc.h | 115 + .../cinn/frontend/paddle_model_convertor.cc | 204 ++ paddle/cinn/frontend/paddle_model_convertor.h | 99 + .../frontend/paddle_model_convertor_test.cc | 108 + .../cinn/frontend/paddle_model_to_program.cc | 736 ++++ .../cinn/frontend/paddle_model_to_program.h | 141 + paddle/cinn/frontend/pass/CMakeLists.txt | 36 + paddle/cinn/frontend/pass/auto_broadcast.cc | 139 + paddle/cinn/frontend/pass/auto_cast.cc | 250 ++ paddle/cinn/frontend/pass/auto_cast_test.cc | 86 + paddle/cinn/frontend/pass/cast_collapsing.cc | 347 ++ .../frontend/pass/cast_collapsing_test.cc | 200 ++ .../cinn/frontend/pass/dead_code_eliminate.cc | 116 + .../frontend/pass/dead_code_eliminate_test.cc | 81 + paddle/cinn/frontend/pass/decomposer.cc | 85 + paddle/cinn/frontend/pass/decomposer_test.cc | 88 + .../frontend/pass/expand_zero_dim_pass.cc | 73 + .../pass/expand_zero_dim_pass_test.cc | 157 + .../frontend/pass/fill_constant_folding.cc | 191 ++ .../pass/fill_constant_folding_test.cc | 210 ++ .../frontend/pass/fill_constant_rewriter.cc | 226 ++ .../pass/fill_constant_rewriter_test.cc | 160 + paddle/cinn/frontend/pass/gemm_rewriter.cc | 216 ++ .../cinn/frontend/pass/gemm_rewriter_test.cc | 280 ++ paddle/cinn/frontend/pass/pass_test_helper.h | 212 
++ .../frontend/pass/program_topoerror_test.cc | 71 + paddle/cinn/frontend/pass/remove_identity.cc | 276 ++ .../frontend/pass/remove_identity_test.cc | 123 + paddle/cinn/frontend/pass/test_helper.h | 185 + .../frontend/pass/transpose_collapsing.cc | 393 +++ .../pass/transpose_collapsing_test.cc | 455 +++ .../frontend/pass/transpose_folding_base.h | 211 ++ .../frontend/pass/transpose_folding_input.cc | 161 + .../pass/transpose_folding_input_test.cc | 257 ++ .../frontend/pass/transpose_folding_output.cc | 111 + .../pass/transpose_folding_output_test.cc | 568 ++++ .../pass/transpose_scale_folding_test.cc | 370 ++ paddle/cinn/frontend/pass/use_program_pass.h | 31 + paddle/cinn/frontend/program_pass.cc | 46 + paddle/cinn/frontend/program_pass.h | 116 + paddle/cinn/frontend/syntax.cc | 568 ++++ paddle/cinn/frontend/syntax.h | 507 +++ paddle/cinn/frontend/syntax_test.cc | 143 + paddle/cinn/frontend/var_type_utils.h | 102 + paddle/cinn/gtest_main.cc | 23 + paddle/cinn/hlir/CMakeLists.txt | 5 + paddle/cinn/hlir/framework/CMakeLists.txt | 41 + .../cinn/hlir/framework/accuracy_checker.cc | 312 ++ paddle/cinn/hlir/framework/accuracy_checker.h | 52 + .../hlir/framework/accuracy_checker_test.cc | 162 + paddle/cinn/hlir/framework/buffer.cc | 96 + paddle/cinn/hlir/framework/buffer.h | 91 + paddle/cinn/hlir/framework/buffer_test.cc | 61 + paddle/cinn/hlir/framework/graph.cc | 514 +++ paddle/cinn/hlir/framework/graph.h | 232 ++ paddle/cinn/hlir/framework/graph_compiler.cc | 1532 +++++++++ paddle/cinn/hlir/framework/graph_compiler.h | 210 ++ .../hlir/framework/graph_compiler_test.cc | 216 ++ paddle/cinn/hlir/framework/graph_test.cc | 70 + paddle/cinn/hlir/framework/instruction.cc | 333 ++ paddle/cinn/hlir/framework/instruction.h | 150 + .../cinn/hlir/framework/instruction_test.cc | 482 +++ paddle/cinn/hlir/framework/memory.cc | 68 + paddle/cinn/hlir/framework/memory.h | 77 + paddle/cinn/hlir/framework/node.cc | 177 + paddle/cinn/hlir/framework/node.h | 212 ++ paddle/cinn/hlir/framework/op.h | 248 ++ paddle/cinn/hlir/framework/op_lowering.cc | 1351 ++++++++ paddle/cinn/hlir/framework/op_lowering.h | 100 + .../cinn/hlir/framework/op_lowering_test.cc | 1268 +++++++ .../cinn/hlir/framework/op_lowering_util.cc | 1661 +++++++++ paddle/cinn/hlir/framework/op_lowering_util.h | 102 + paddle/cinn/hlir/framework/op_strategy.cc | 55 + paddle/cinn/hlir/framework/op_strategy.h | 138 + paddle/cinn/hlir/framework/op_test.cc | 87 + .../cinn/hlir/framework/parallel_compiler.cc | 230 ++ .../cinn/hlir/framework/parallel_compiler.h | 97 + .../hlir/framework/parallel_compiler_test.cc | 83 + paddle/cinn/hlir/framework/pass.cc | 58 + paddle/cinn/hlir/framework/pass.h | 109 + .../hlir/framework/print_graph_pass_test.cc | 78 + paddle/cinn/hlir/framework/schedule.h | 64 + paddle/cinn/hlir/framework/scope.cc | 51 + paddle/cinn/hlir/framework/scope.h | 75 + paddle/cinn/hlir/framework/scope_test.cc | 45 + paddle/cinn/hlir/framework/tensor.cc | 58 + paddle/cinn/hlir/framework/tensor.h | 118 + paddle/cinn/hlir/framework/tensor_test.cc | 36 + paddle/cinn/hlir/framework/variable.cc | 21 + paddle/cinn/hlir/framework/variable.h | 21 + .../cinn/hlir/framework/visualize_helper.cc | 440 +++ paddle/cinn/hlir/framework/visualize_helper.h | 161 + paddle/cinn/hlir/kernels/CMakeLists.txt | 0 paddle/cinn/hlir/op/CMakeLists.txt | 23 + paddle/cinn/hlir/op/broadcast.cc | 455 +++ paddle/cinn/hlir/op/contrib/CMakeLists.txt | 29 + paddle/cinn/hlir/op/contrib/argmax.cc | 249 ++ paddle/cinn/hlir/op/contrib/argmax.h | 32 + 
paddle/cinn/hlir/op/contrib/argmax_test.cc | 119 + paddle/cinn/hlir/op/contrib/argmin.cc | 248 ++ paddle/cinn/hlir/op/contrib/argmin.h | 32 + paddle/cinn/hlir/op/contrib/argmin_test.cc | 118 + paddle/cinn/hlir/op/contrib/assert_true.cc | 89 + .../cinn/hlir/op/contrib/bitcast_convert.cc | 133 + paddle/cinn/hlir/op/contrib/cholesky.cc | 110 + paddle/cinn/hlir/op/contrib/gather_nd.cc | 193 ++ paddle/cinn/hlir/op/contrib/gather_nd.h | 32 + paddle/cinn/hlir/op/contrib/gather_nd_test.cc | 95 + .../cinn/hlir/op/contrib/gaussian_random.cc | 111 + .../hlir/op/contrib/logical_right_shift.cc | 157 + .../hlir/op/contrib/logical_right_shift.h | 35 + .../op/contrib/logical_right_shift_test.cc | 64 + paddle/cinn/hlir/op/contrib/lookup_table.cc | 148 + paddle/cinn/hlir/op/contrib/lookup_table.h | 37 + .../cinn/hlir/op/contrib/lookup_table_test.cc | 94 + paddle/cinn/hlir/op/contrib/one_hot.cc | 225 ++ paddle/cinn/hlir/op/contrib/one_hot.h | 38 + paddle/cinn/hlir/op/contrib/one_hot_test.cc | 107 + paddle/cinn/hlir/op/contrib/randint.cc | 104 + paddle/cinn/hlir/op/contrib/reciprocal.cc | 156 + paddle/cinn/hlir/op/contrib/reciprocal.h | 29 + .../cinn/hlir/op/contrib/reciprocal_test.cc | 68 + paddle/cinn/hlir/op/contrib/repeat.cc | 230 ++ paddle/cinn/hlir/op/contrib/repeat.h | 32 + paddle/cinn/hlir/op/contrib/repeat_test.cc | 116 + paddle/cinn/hlir/op/contrib/resize.cc | 241 ++ paddle/cinn/hlir/op/contrib/resize.h | 36 + paddle/cinn/hlir/op/contrib/sort.cc | 412 +++ paddle/cinn/hlir/op/contrib/sort.h | 44 + paddle/cinn/hlir/op/contrib/sort_test.cc | 133 + .../cinn/hlir/op/contrib/triangular_solve.cc | 121 + paddle/cinn/hlir/op/contrib/uniform_random.cc | 111 + paddle/cinn/hlir/op/custom_call.cc | 853 +++++ paddle/cinn/hlir/op/elementwise.cc | 1056 ++++++ paddle/cinn/hlir/op/external_api_registry.cc | 86 + paddle/cinn/hlir/op/external_api_registry.h | 78 + .../hlir/op/external_api_registry_test.cc | 50 + paddle/cinn/hlir/op/nn.cc | 2460 ++++++++++++++ paddle/cinn/hlir/op/op_broadcast_test.cc | 318 ++ paddle/cinn/hlir/op/op_nn_test.cc | 513 +++ paddle/cinn/hlir/op/op_util.cc | 169 + paddle/cinn/hlir/op/op_util.h | 140 + paddle/cinn/hlir/op/reduction.cc | 505 +++ paddle/cinn/hlir/op/reduction_test.cc | 561 +++ paddle/cinn/hlir/op/transform.cc | 1797 ++++++++++ paddle/cinn/hlir/op/transform_test.cc | 121 + paddle/cinn/hlir/op/use_ops.h | 43 + paddle/cinn/hlir/pass/CMakeLists.txt | 42 + paddle/cinn/hlir/pass/alterlayout.cc | 649 ++++ paddle/cinn/hlir/pass/alterlayout_test.cc | 458 +++ .../hlir/pass/check_fusion_accuracy_pass.cc | 546 +++ .../pass/check_fusion_accuracy_pass_test.cc | 589 ++++ .../pass/common_subexpression_elimination.cc | 307 ++ .../common_subexpression_elimination_test.cc | 198 ++ paddle/cinn/hlir/pass/const_propagate.cc | 74 + paddle/cinn/hlir/pass/const_propagate_test.cc | 131 + .../cinn/hlir/pass/constant_folding_pass.cc | 122 + .../hlir/pass/constant_folding_pass_test.cc | 333 ++ .../hlir/pass/constant_folding_pass_util.cc | 237 ++ .../hlir/pass/constant_folding_pass_util.h | 39 + paddle/cinn/hlir/pass/custom_call_pass.cc | 100 + paddle/cinn/hlir/pass/dce_pass.cc | 135 + paddle/cinn/hlir/pass/dce_pass_test.cc | 64 + paddle/cinn/hlir/pass/dense_merge_pass.cc | 187 + .../cinn/hlir/pass/dense_merge_pass_test.cc | 168 + paddle/cinn/hlir/pass/dot_merger.cc | 437 +++ paddle/cinn/hlir/pass/dot_merger_test.cc | 112 + paddle/cinn/hlir/pass/fusion_helper_base.h | 208 ++ paddle/cinn/hlir/pass/fusion_merge_pass.cc | 1032 ++++++ .../cinn/hlir/pass/fusion_merge_pass_test.cc | 487 +++ 
.../cinn/hlir/pass/fusion_merge_pass_util.h | 561 +++ paddle/cinn/hlir/pass/infershape.cc | 128 + paddle/cinn/hlir/pass/infershape.h | 29 + paddle/cinn/hlir/pass/op_fusion_pass.cc | 384 +++ paddle/cinn/hlir/pass/op_fusion_pass_test.cc | 276 ++ paddle/cinn/hlir/pass/op_fusion_pass_util.h | 337 ++ paddle/cinn/hlir/pass/opfusion.cc | 536 +++ paddle/cinn/hlir/pass/opfusion_test.cc | 540 +++ paddle/cinn/hlir/pass/reduce_split_pass.cc | 230 ++ .../cinn/hlir/pass/reduce_split_pass_test.cc | 95 + .../hlir/pass/single_group_optimize_pass.cc | 201 ++ paddle/cinn/hlir/pass/test_dot_merger.cc | 100 + paddle/cinn/hlir/pass/test_primitive_ops.cc | 153 + paddle/cinn/hlir/pass/use_pass.h | 35 + paddle/cinn/hlir/pe/CMakeLists.txt | 25 + paddle/cinn/hlir/pe/broadcast.cc | 372 ++ paddle/cinn/hlir/pe/broadcast.h | 126 + paddle/cinn/hlir/pe/elementwise.cc | 233 ++ paddle/cinn/hlir/pe/elementwise.h | 129 + paddle/cinn/hlir/pe/ir_schedule_pe.cc | 1223 +++++++ paddle/cinn/hlir/pe/ir_schedule_pe.h | 102 + paddle/cinn/hlir/pe/load_params_test.cc | 62 + paddle/cinn/hlir/pe/load_x86_params.cc | 1308 +++++++ paddle/cinn/hlir/pe/load_x86_params.h | 50 + paddle/cinn/hlir/pe/nn.cc | 1290 +++++++ paddle/cinn/hlir/pe/nn.h | 426 +++ paddle/cinn/hlir/pe/nn_util.cc | 428 +++ paddle/cinn/hlir/pe/nn_util.h | 46 + paddle/cinn/hlir/pe/pe_broadcast_test.cc | 220 ++ paddle/cinn/hlir/pe/pe_elementwise_test.cc | 160 + paddle/cinn/hlir/pe/pe_transform_test.cc | 229 ++ paddle/cinn/hlir/pe/reduction.cc | 884 +++++ paddle/cinn/hlir/pe/reduction.h | 419 +++ paddle/cinn/hlir/pe/schedule.cc | 2270 +++++++++++++ paddle/cinn/hlir/pe/schedule.h | 248 ++ paddle/cinn/hlir/pe/schedule_param.proto | 29 + paddle/cinn/hlir/pe/transform.cc | 1182 +++++++ paddle/cinn/hlir/pe/transform.h | 232 ++ paddle/cinn/hlir/pe/vision.cc | 21 + paddle/cinn/hlir/pe/vision.h | 21 + paddle/cinn/ir/CMakeLists.txt | 44 + paddle/cinn/ir/buffer.cc | 169 + paddle/cinn/ir/buffer.h | 192 ++ paddle/cinn/ir/buffer_test.cc | 86 + paddle/cinn/ir/collect_ir_nodes.cc | 186 + paddle/cinn/ir/collect_ir_nodes.h | 56 + paddle/cinn/ir/collect_ir_nodes_test.cc | 58 + paddle/cinn/ir/function_base.cc | 19 + paddle/cinn/ir/function_base.h | 34 + paddle/cinn/ir/function_definition.cc | 19 + paddle/cinn/ir/function_definition.h | 43 + paddle/cinn/ir/intrinsic_ops.cc | 127 + paddle/cinn/ir/intrinsic_ops.h | 200 ++ paddle/cinn/ir/intrinsic_ops_test.cc | 31 + paddle/cinn/ir/ir.cc | 819 +++++ paddle/cinn/ir/ir.h | 999 ++++++ paddle/cinn/ir/ir_base.cc | 231 ++ paddle/cinn/ir/ir_base.h | 500 +++ paddle/cinn/ir/ir_compare.cc | 319 ++ paddle/cinn/ir/ir_compare.h | 46 + paddle/cinn/ir/ir_compare_test.cc | 124 + paddle/cinn/ir/ir_mutator.cc | 22 + paddle/cinn/ir/ir_mutator.h | 334 ++ paddle/cinn/ir/ir_operators.cc | 153 + paddle/cinn/ir/ir_operators.h | 133 + paddle/cinn/ir/ir_operators_test.cc | 28 + paddle/cinn/ir/ir_printer.cc | 645 ++++ paddle/cinn/ir/ir_printer.h | 80 + paddle/cinn/ir/ir_printer_test.cc | 23 + paddle/cinn/ir/ir_schedule.cc | 2310 +++++++++++++ paddle/cinn/ir/ir_schedule.h | 614 ++++ paddle/cinn/ir/ir_schedule_util.cc | 1038 ++++++ paddle/cinn/ir/ir_schedule_util.h | 448 +++ paddle/cinn/ir/ir_test.cc | 31 + paddle/cinn/ir/ir_verify.cc | 39 + paddle/cinn/ir/ir_verify.h | 22 + paddle/cinn/ir/ir_verify_test.cc | 29 + paddle/cinn/ir/ir_visitor.cc | 35 + paddle/cinn/ir/ir_visitor.h | 82 + paddle/cinn/ir/layout.cc | 67 + paddle/cinn/ir/layout.h | 48 + paddle/cinn/ir/lowered_func.cc | 472 +++ paddle/cinn/ir/lowered_func.h | 198 ++ paddle/cinn/ir/module.cc | 97 + paddle/cinn/ir/module.h 
| 89 + paddle/cinn/ir/operation.cc | 113 + paddle/cinn/ir/operation.h | 130 + paddle/cinn/ir/registry.cc | 93 + paddle/cinn/ir/registry.h | 46 + paddle/cinn/ir/schedule_desc.cc | 680 ++++ paddle/cinn/ir/schedule_desc.h | 106 + paddle/cinn/ir/schedule_desc.proto | 67 + paddle/cinn/ir/schedule_desc_test.cc | 809 +++++ paddle/cinn/ir/tensor.cc | 590 ++++ paddle/cinn/ir/tensor.h | 342 ++ paddle/cinn/ir/tensor_test.cc | 211 ++ paddle/cinn/lang/CMakeLists.txt | 17 + paddle/cinn/lang/README.md | 93 + paddle/cinn/lang/buffer.cc | 36 + paddle/cinn/lang/buffer.h | 44 + paddle/cinn/lang/builtin.cc | 262 ++ paddle/cinn/lang/builtin.h | 173 + paddle/cinn/lang/compute.cc | 229 ++ paddle/cinn/lang/compute.h | 132 + paddle/cinn/lang/compute_test.cc | 39 + paddle/cinn/lang/lower.cc | 302 ++ paddle/cinn/lang/lower.h | 85 + paddle/cinn/lang/lower_impl.cc | 791 +++++ paddle/cinn/lang/lower_impl.h | 304 ++ paddle/cinn/lang/lower_impl_test.cc | 320 ++ paddle/cinn/lang/lower_test.cc | 155 + paddle/cinn/lang/packed_func.cc | 27 + paddle/cinn/lang/packed_func.h | 128 + paddle/cinn/lang/packed_func_test.cc | 95 + paddle/cinn/lang/placeholder.cc | 65 + paddle/cinn/lang/placeholder.h | 115 + paddle/cinn/lang/placeholder_test.cc | 48 + paddle/cinn/optim/CMakeLists.txt | 50 + paddle/cinn/optim/buffer_assign.cc | 156 + paddle/cinn/optim/buffer_assign.h | 39 + .../optim/cache_read_write_replace_test.cc | 125 + .../cinn/optim/call_arg_list_to_pod_value.cc | 108 + .../cinn/optim/call_arg_list_to_pod_value.h | 28 + paddle/cinn/optim/cast_bool_to_int8.cc | 47 + paddle/cinn/optim/cast_bool_to_int8.h | 34 + paddle/cinn/optim/cast_simplify.cc | 117 + paddle/cinn/optim/cast_simplify.h | 30 + paddle/cinn/optim/cast_simplify_test.cc | 63 + paddle/cinn/optim/collect_undefined_vars.cc | 109 + paddle/cinn/optim/collect_undefined_vars.h | 36 + paddle/cinn/optim/compute_inline_expand.cc | 233 ++ paddle/cinn/optim/compute_inline_expand.h | 33 + .../optim/eliminate_broadcast_in_forloop.cc | 111 + .../optim/eliminate_broadcast_in_forloop.h | 24 + paddle/cinn/optim/extern_call_process.cc | 41 + paddle/cinn/optim/extern_call_process.h | 27 + paddle/cinn/optim/fold_cinn_call_arguments.cc | 114 + paddle/cinn/optim/fold_cinn_call_arguments.h | 46 + paddle/cinn/optim/if_simplify.cc | 57 + paddle/cinn/optim/if_simplify.h | 22 + paddle/cinn/optim/if_simplify_test.cc | 70 + paddle/cinn/optim/insert_debug_log_callee.cc | 275 ++ paddle/cinn/optim/insert_debug_log_callee.h | 27 + paddle/cinn/optim/ir_copy.cc | 480 +++ paddle/cinn/optim/ir_copy.h | 43 + paddle/cinn/optim/ir_copy_test.cc | 31 + paddle/cinn/optim/ir_replace.cc | 64 + paddle/cinn/optim/ir_replace.h | 27 + paddle/cinn/optim/ir_simplify.cc | 365 ++ paddle/cinn/optim/ir_simplify.h | 37 + paddle/cinn/optim/ir_simplify_test.cc | 127 + .../optim/lower_function_call_bind_vars.cc | 73 + .../optim/lower_function_call_bind_vars.h | 26 + paddle/cinn/optim/lower_intrin.cc | 95 + paddle/cinn/optim/lower_intrin.h | 41 + paddle/cinn/optim/map_extern_call.cc | 119 + paddle/cinn/optim/map_extern_call.h | 33 + paddle/cinn/optim/optimize.cc | 111 + paddle/cinn/optim/optimize.h | 36 + paddle/cinn/optim/optimize_test.cc | 58 + paddle/cinn/optim/remove_nested_block.cc | 121 + paddle/cinn/optim/remove_nested_block.h | 33 + paddle/cinn/optim/remove_nested_block_test.cc | 58 + paddle/cinn/optim/remove_schedule_block.cc | 50 + paddle/cinn/optim/remove_schedule_block.h | 33 + .../cinn/optim/remove_schedule_block_test.cc | 98 + paddle/cinn/optim/replace_call_with_expr.cc | 125 + 
paddle/cinn/optim/replace_call_with_expr.h | 45 + .../cinn/optim/replace_call_with_expr_test.cc | 31 + .../optim/replace_const_param_to_integer.cc | 43 + .../optim/replace_const_param_to_integer.h | 34 + paddle/cinn/optim/replace_var_with_expr.cc | 159 + paddle/cinn/optim/replace_var_with_expr.h | 77 + paddle/cinn/optim/tensor_write_tell.cc | 19 + paddle/cinn/optim/tensor_write_tell.h | 54 + paddle/cinn/optim/transform_gpu_forloop.cc | 664 ++++ paddle/cinn/optim/transform_gpu_forloop.h | 65 + paddle/cinn/optim/transform_polyfor_to_for.cc | 136 + paddle/cinn/optim/transform_polyfor_to_for.h | 32 + .../optim/transform_polyfor_to_for_test.cc | 109 + paddle/cinn/optim/unroll_loops.cc | 118 + paddle/cinn/optim/unroll_loops.h | 24 + paddle/cinn/optim/unroll_loops_test.cc | 101 + paddle/cinn/optim/var_mod_simplify.cc | 91 + paddle/cinn/optim/var_mod_simplify.h | 32 + paddle/cinn/optim/vectorize_loops.cc | 890 +++++ paddle/cinn/optim/vectorize_loops.h | 37 + paddle/cinn/optim/vectorize_loops_test.cc | 288 ++ paddle/cinn/poly/CMakeLists.txt | 24 + paddle/cinn/poly/ast_gen.cc | 566 +++ paddle/cinn/poly/ast_gen.h | 99 + paddle/cinn/poly/ast_gen_test.cc | 130 + paddle/cinn/poly/compute_at_transform.cc | 244 ++ paddle/cinn/poly/compute_at_transform.h | 116 + paddle/cinn/poly/compute_at_transform_test.cc | 53 + paddle/cinn/poly/dim.cc | 36 + paddle/cinn/poly/dim.h | 68 + paddle/cinn/poly/domain.cc | 76 + paddle/cinn/poly/domain.h | 54 + .../cinn/poly/domain_add_unit_loop_mutator.cc | 217 ++ .../cinn/poly/domain_add_unit_loop_mutator.h | 51 + paddle/cinn/poly/graph.cc | 129 + paddle/cinn/poly/graph.h | 96 + paddle/cinn/poly/graph_test.cc | 24 + paddle/cinn/poly/isl_utils.cc | 512 +++ paddle/cinn/poly/isl_utils.h | 143 + paddle/cinn/poly/isl_utils_test.cc | 39 + paddle/cinn/poly/map.cc | 101 + paddle/cinn/poly/map.h | 108 + paddle/cinn/poly/naive_scheduler.cc | 53 + paddle/cinn/poly/naive_scheduler.h | 60 + paddle/cinn/poly/poly_scheduler.cc | 462 +++ paddle/cinn/poly/poly_scheduler.h | 87 + paddle/cinn/poly/poly_scheduler_test.cc | 21 + paddle/cinn/poly/schedule.cc | 254 ++ paddle/cinn/poly/schedule.h | 228 ++ paddle/cinn/poly/schedule_test.cc | 125 + paddle/cinn/poly/stage.cc | 1666 +++++++++ paddle/cinn/poly/stage.h | 537 +++ paddle/cinn/poly/stage_test.cc | 554 +++ paddle/cinn/pybind/CMakeLists.txt | 29 + paddle/cinn/pybind/backends.cc | 81 + paddle/cinn/pybind/bind.cc | 52 + paddle/cinn/pybind/bind.h | 52 + paddle/cinn/pybind/bind_utils.h | 168 + paddle/cinn/pybind/common.cc | 322 ++ paddle/cinn/pybind/framework.cc | 196 ++ paddle/cinn/pybind/frontend.cc | 799 +++++ paddle/cinn/pybind/ir.cc | 636 ++++ paddle/cinn/pybind/lang.cc | 248 ++ paddle/cinn/pybind/optim.cc | 52 + paddle/cinn/pybind/pe.cc | 135 + paddle/cinn/pybind/poly.cc | 124 + paddle/cinn/pybind/runtime.cc | 279 ++ paddle/cinn/pybind/utils.cc | 70 + paddle/cinn/runtime/CMakeLists.txt | 23 + paddle/cinn/runtime/buffer.cc | 52 + paddle/cinn/runtime/buffer.h | 100 + paddle/cinn/runtime/cinn_runtime.cc | 495 +++ paddle/cinn/runtime/cinn_runtime.h | 570 ++++ paddle/cinn/runtime/cinn_runtime_test.cc | 49 + paddle/cinn/runtime/cinn_x86_device_impl.cc | 85 + paddle/cinn/runtime/cpu/CMakeLists.txt | 26 + paddle/cinn/runtime/cpu/cblas.cc | 226 ++ paddle/cinn/runtime/cpu/cblas.h | 102 + paddle/cinn/runtime/cpu/host_intrinsics.cc | 460 +++ paddle/cinn/runtime/cpu/host_intrinsics.h | 122 + .../cinn/runtime/cpu/host_intrinsics_test.cc | 208 ++ paddle/cinn/runtime/cpu/mkl_math.cc | 105 + paddle/cinn/runtime/cpu/mkl_math.h | 53 + 
paddle/cinn/runtime/cpu/mkl_math_test.cc | 217 ++ paddle/cinn/runtime/cpu/mkldnn_math.cc | 204 ++ paddle/cinn/runtime/cpu/mkldnn_math.h | 45 + paddle/cinn/runtime/cpu/mkldnn_math_test.cc | 123 + paddle/cinn/runtime/cpu/thread_backend.cc | 69 + paddle/cinn/runtime/cpu/thread_backend.h | 46 + paddle/cinn/runtime/cpu/use_extern_funcs.h | 27 + paddle/cinn/runtime/cuda/CMakeLists.txt | 19 + paddle/cinn/runtime/cuda/bfloat16.h | 402 +++ .../runtime/cuda/cinn_cuda_runtime_source.cuh | 865 +++++ paddle/cinn/runtime/cuda/cublas_util.h | 328 ++ .../runtime/cuda/cuda_instrinsics_bfloat16.cc | 80 + .../runtime/cuda/cuda_instrinsics_float16.cc | 124 + paddle/cinn/runtime/cuda/cuda_intrinsics.cc | 733 ++++ .../runtime/cuda/cuda_intrinsics_reduce.cc | 156 + paddle/cinn/runtime/cuda/cuda_module.cc | 151 + paddle/cinn/runtime/cuda/cuda_module.h | 80 + paddle/cinn/runtime/cuda/cuda_module_test.cc | 179 + paddle/cinn/runtime/cuda/cuda_util.cc | 2277 +++++++++++++ paddle/cinn/runtime/cuda/cuda_util.h | 309 ++ paddle/cinn/runtime/cuda/float16.h | 629 ++++ paddle/cinn/runtime/cuda/test_util.h | 56 + paddle/cinn/runtime/cuda/use_extern_funcs.h | 24 + paddle/cinn/runtime/custom_function.cc | 199 ++ paddle/cinn/runtime/custom_function.h | 66 + paddle/cinn/runtime/custom_function_test.cc | 352 ++ paddle/cinn/runtime/flags.cc | 227 ++ paddle/cinn/runtime/flags.h | 62 + paddle/cinn/runtime/intrinsic.cc | 67 + paddle/cinn/runtime/intrinsic.h | 136 + paddle/cinn/runtime/intrinsic_types.cc | 29 + paddle/cinn/runtime/intrinsic_types.h | 52 + paddle/cinn/runtime/tiny_runtime.cc | 158 + paddle/cinn/runtime/use_extern_funcs.h | 20 + paddle/cinn/utils/CMakeLists.txt | 23 + paddle/cinn/utils/data_util.cc | 121 + paddle/cinn/utils/data_util.h | 45 + paddle/cinn/utils/dot_lang.cc | 159 + paddle/cinn/utils/dot_lang.h | 135 + paddle/cinn/utils/error.cc | 17 + paddle/cinn/utils/error.h | 24 + paddle/cinn/utils/event.cc | 125 + paddle/cinn/utils/event.h | 113 + paddle/cinn/utils/functional.cc | 40 + paddle/cinn/utils/functional.h | 127 + paddle/cinn/utils/functional_test.cc | 119 + paddle/cinn/utils/multi_threading.cc | 96 + paddle/cinn/utils/multi_threading.h | 62 + paddle/cinn/utils/multi_threading_test.cc | 59 + paddle/cinn/utils/profiler.cc | 126 + paddle/cinn/utils/profiler.h | 88 + paddle/cinn/utils/profiler_test.cc | 78 + paddle/cinn/utils/random_engine.cc | 41 + paddle/cinn/utils/random_engine.h | 109 + paddle/cinn/utils/registry.h | 210 ++ paddle/cinn/utils/sized_multi_set.cc | 17 + paddle/cinn/utils/sized_multi_set.h | 82 + paddle/cinn/utils/sized_multi_set_test.cc | 82 + paddle/cinn/utils/small_vector.cc | 17 + paddle/cinn/utils/small_vector.h | 23 + paddle/cinn/utils/string.cc | 167 + paddle/cinn/utils/string.h | 94 + paddle/cinn/utils/string_test.cc | 38 + paddle/cinn/utils/timer.cc | 31 + paddle/cinn/utils/timer.h | 35 + paddle/cinn/utils/type_defs.h | 46 + python/cinn/__init__.py | 28 + python/cinn/auto_schedule/__init__.py | 13 + .../cinn/auto_schedule/cost_model/__init__.py | 23 + .../auto_schedule/cost_model/cost_model.py | 82 + .../cost_model/xgb_cost_model.py | 98 + python/cinn/backends.py | 16 + python/cinn/common.py | 20 + python/cinn/framework.py | 15 + python/cinn/frontend.py | 15 + python/cinn/ir/__init__.py | 42 + python/cinn/lang.py | 18 + python/cinn/libs/__init__.py | 15 + python/cinn/optim.py | 16 + python/cinn/pe.py | 15 + python/cinn/poly.py | 15 + python/cinn/runtime.py | 15 + python/cinn/utils.py | 15 + python/cinn/version/__init__.py | 18 + .../cost_model/test_cost_model.py | 58 + 
test/cinn/conv2d_utils.py | 95 + test/cinn/fake_model/naive_mul.py | 43 + test/cinn/fake_model/naive_multi_fc.py | 58 + test/cinn/fake_model/resnet_model.py | 51 + test/cinn/fusion/fusion_test.py | 62 + .../fusion/test_cast_broadcast_reduce_max.py | 68 + test/cinn/fusion/test_reduce_cast.py | 39 + test/cinn/fusion/test_select_reduce.py | 51 + test/cinn/op_mappers/op_mapper_test.py | 423 +++ test/cinn/op_mappers/test_argmax_op.py | 113 + test/cinn/op_mappers/test_argmin_op.py | 113 + test/cinn/op_mappers/test_argsort_op.py | 55 + test/cinn/op_mappers/test_assign_value_op.py | 117 + test/cinn/op_mappers/test_atan2_op.py | 103 + test/cinn/op_mappers/test_batch_norm_op.py | 112 + test/cinn/op_mappers/test_bitwise_op.py | 85 + test/cinn/op_mappers/test_cholesky_op.py | 92 + test/cinn/op_mappers/test_clip_op.py | 256 ++ test/cinn/op_mappers/test_compare_op.py | 90 + test/cinn/op_mappers/test_conv2d_op.py | 93 + test/cinn/op_mappers/test_cumsum_op.py | 112 + test/cinn/op_mappers/test_elementwise_op.py | 117 + test/cinn/op_mappers/test_expand_op.py | 50 + test/cinn/op_mappers/test_expand_v2_op.py | 50 + test/cinn/op_mappers/test_fill_constant_op.py | 89 + test/cinn/op_mappers/test_flip_op.py | 52 + test/cinn/op_mappers/test_gather_nd_op.py | 63 + test/cinn/op_mappers/test_gather_op.py | 65 + .../op_mappers/test_gaussian_random_op.py | 79 + test/cinn/op_mappers/test_layer_norm_op.py | 79 + test/cinn/op_mappers/test_log1p_op.py | 50 + test/cinn/op_mappers/test_logical_op.py | 82 + test/cinn/op_mappers/test_lookup_table_op.py | 106 + test/cinn/op_mappers/test_mul_op.py | 80 + test/cinn/op_mappers/test_norm_op.py | 70 + test/cinn/op_mappers/test_one_hot_op.py | 116 + test/cinn/op_mappers/test_pool2d_op.py | 131 + test/cinn/op_mappers/test_pow_op.py | 83 + test/cinn/op_mappers/test_randint_op.py | 81 + test/cinn/op_mappers/test_reduce_op.py | 141 + test/cinn/op_mappers/test_reverse_op.py | 52 + test/cinn/op_mappers/test_roll_op.py | 111 + test/cinn/op_mappers/test_scale_op.py | 115 + test/cinn/op_mappers/test_scatter_op.py | 87 + test/cinn/op_mappers/test_sign_op.py | 50 + test/cinn/op_mappers/test_split_op.py | 94 + test/cinn/op_mappers/test_squeeze_op.py | 62 + test/cinn/op_mappers/test_stack_op.py | 54 + test/cinn/op_mappers/test_strided_slice_op.py | 121 + .../op_mappers/test_take_along_axis_op.py | 150 + test/cinn/op_mappers/test_tile_op.py | 91 + test/cinn/op_mappers/test_transpose2_op.py | 71 + .../op_mappers/test_triangular_solve_op.py | 69 + test/cinn/op_mappers/test_unary_op.py | 242 ++ .../cinn/op_mappers/test_uniform_random_op.py | 91 + test/cinn/op_mappers/test_where_op.py | 60 + test/cinn/ops/op_test.py | 320 ++ test/cinn/ops/op_test_helper.py | 134 + test/cinn/ops/test_abs_op.py | 111 + test/cinn/ops/test_acos_op.py | 104 + test/cinn/ops/test_add_op.py | 257 ++ test/cinn/ops/test_arange_op.py | 191 ++ test/cinn/ops/test_argsort_op.py | 112 + test/cinn/ops/test_asin_op.py | 109 + test/cinn/ops/test_asinh_op.py | 101 + test/cinn/ops/test_atan2_op.py | 139 + test/cinn/ops/test_atan_op.py | 101 + test/cinn/ops/test_atanh_op.py | 101 + test/cinn/ops/test_batch_norm_op.py | 245 ++ test/cinn/ops/test_binary_elementwise_op.py | 370 ++ test/cinn/ops/test_bitcast_convert_op.py | 103 + test/cinn/ops/test_bitwise_op.py | 256 ++ test/cinn/ops/test_broadcast_to_op.py | 174 + test/cinn/ops/test_broadcast_to_op_new.py | 224 ++ test/cinn/ops/test_cast_op.py | 155 + test/cinn/ops/test_cbrt_op.py | 143 + test/cinn/ops/test_ceil_op.py | 144 + test/cinn/ops/test_cholesky_op.py | 231 ++ 
 test/cinn/ops/test_clz_op.py | 140 +
 test/cinn/ops/test_comparison_op.py | 357 ++
 test/cinn/ops/test_concat_op.py | 362 ++
 test/cinn/ops/test_constant_op.py | 163 +
 test/cinn/ops/test_conv2d_op.py | 210 ++
 test/cinn/ops/test_cos_op.py | 106 +
 test/cinn/ops/test_cosh_op.py | 106 +
 test/cinn/ops/test_depthwise_conv2d_op.py | 192 ++
 test/cinn/ops/test_divide_op.py | 278 ++
 test/cinn/ops/test_dropout_infer_op.py | 121 +
 test/cinn/ops/test_erf_op.py | 106 +
 test/cinn/ops/test_exp_op.py | 104 +
 test/cinn/ops/test_expand_dims.py | 145 +
 test/cinn/ops/test_fill_constant_op.py | 259 ++
 test/cinn/ops/test_floor_divide_op.py | 232 ++
 test/cinn/ops/test_floor_op.py | 109 +
 test/cinn/ops/test_gather_nd_op.py | 99 +
 test/cinn/ops/test_gather_op.py | 157 +
 test/cinn/ops/test_gaussian_random_op.py | 158 +
 test/cinn/ops/test_gelu_op.py | 116 +
 test/cinn/ops/test_identity_op.py | 114 +
 test/cinn/ops/test_is_finite_op.py | 116 +
 test/cinn/ops/test_is_inf_op.py | 116 +
 test/cinn/ops/test_is_nan_op.py | 116 +
 test/cinn/ops/test_isclose_op.py | 205 ++
 test/cinn/ops/test_left_shift_op.py | 150 +
 test/cinn/ops/test_log_op.py | 145 +
 test/cinn/ops/test_logical_right_shift_op.py | 130 +
 test/cinn/ops/test_lookup_table_op.py | 112 +
 test/cinn/ops/test_matmul_op.py | 259 ++
 test/cinn/ops/test_max_op.py | 108 +
 test/cinn/ops/test_mod_op.py | 132 +
 test/cinn/ops/test_mul_op.py | 64 +
 test/cinn/ops/test_multiply_op.py | 89 +
 test/cinn/ops/test_negative_op.py | 121 +
 test/cinn/ops/test_one_hot_op.py | 71 +
 test/cinn/ops/test_pool2d_op.py | 339 ++
 test/cinn/ops/test_popc_op.py | 139 +
 test/cinn/ops/test_pow_op.py | 152 +
 test/cinn/ops/test_randint_op.py | 82 +
 test/cinn/ops/test_reciprocal_op.py | 98 +
 test/cinn/ops/test_reduce_op.py | 680 ++++
 test/cinn/ops/test_reduce_op_new.py | 216 ++
 test/cinn/ops/test_reduce_op_other.py | 87 +
 test/cinn/ops/test_relu6_op.py | 94 +
 test/cinn/ops/test_relu_op.py | 155 +
 test/cinn/ops/test_remainder_op.py | 198 ++
 test/cinn/ops/test_repeat_op.py | 267 ++
 test/cinn/ops/test_reshape_op.py | 223 ++
 test/cinn/ops/test_resize_op.py | 115 +
 test/cinn/ops/test_reverse_op.py | 311 ++
 test/cinn/ops/test_right_shift_op.py | 150 +
 test/cinn/ops/test_round_op.py | 112 +
 test/cinn/ops/test_rsqrt_op.py | 109 +
 test/cinn/ops/test_scale_op.py | 169 +
 test/cinn/ops/test_scatter_add.py | 351 ++
 test/cinn/ops/test_scatter_assign_op.py | 263 ++
 test/cinn/ops/test_select_op.py | 153 +
 test/cinn/ops/test_sigmoid_op.py | 144 +
 test/cinn/ops/test_sign_op.py | 146 +
 test/cinn/ops/test_sin_op.py | 106 +
 test/cinn/ops/test_sinh_op.py | 106 +
 test/cinn/ops/test_slice_assign_op.py | 421 +++
 test/cinn/ops/test_slice_op.py | 378 +++
 test/cinn/ops/test_softmax_op.py | 97 +
 test/cinn/ops/test_sort_op.py | 228 ++
 test/cinn/ops/test_split_op.py | 368 ++
 test/cinn/ops/test_sqrt_op.py | 107 +
 test/cinn/ops/test_squeeze_op.py | 213 ++
 test/cinn/ops/test_subtract_op.py | 250 ++
 test/cinn/ops/test_sum_op.py | 163 +
 test/cinn/ops/test_tan_op.py | 106 +
 test/cinn/ops/test_tanh_op.py | 106 +
 test/cinn/ops/test_top_k_op.py | 307 ++
 test/cinn/ops/test_transpose_op.py | 272 ++
 test/cinn/ops/test_triangular_solve_op.py | 392 +++
 test/cinn/ops/test_trunc_op.py | 111 +
 test/cinn/ops/test_unary_elementwise_op.py | 409 +++
 test/cinn/ops/test_uniform_random_op.py | 159 +
 test/cinn/ops/test_zero_dim_tensor.py | 642 ++++
 test/cinn/passes/pass_test.py | 104 +
 test/cinn/passes/test_auto_cast_pass.py | 40 +
 test/cinn/passes/test_expand_zero_dim_pass.py | 44 +
 .../test_transpose_floding_input_pass.py | 244 ++
 .../test_transpose_floding_output_pass.py | 108 +
 test/cinn/pool_utils.py | 422 +++
 test/cinn/test_common.py | 42 +
 test/cinn/test_computation.py | 130 +
 test/cinn/test_efficientnet.py | 108 +
 test/cinn/test_facedet.py | 109 +
 test/cinn/test_frontend.py | 191 ++
 test/cinn/test_hlir_framework.py | 34 +
 test/cinn/test_ir.py | 50 +
 test/cinn/test_matmul.py | 133 +
 test/cinn/test_mobilenetv1.py | 109 +
 test/cinn/test_mobilenetv2.py | 112 +
 test/cinn/test_netbuilder.py | 118 +
 test/cinn/test_op_benchmark.py | 479 +++
 test/cinn/test_op_broadcast.py | 104 +
 test/cinn/test_op_nn.py | 595 ++++
 test/cinn/test_op_transform.py | 212 ++
 test/cinn/test_packed_func.py | 75 +
 test/cinn/test_paddle_model_convertor.py | 269 ++
 test/cinn/test_pe_elementwise.py | 164 +
 test/cinn/test_pe_reduction.py | 179 +
 test/cinn/test_pe_transform.py | 136 +
 test/cinn/test_resnet.py | 89 +
 test/cinn/test_resnet18.py | 110 +
 test/cinn/test_resnet50.py | 113 +
 test/cinn/test_squeezenet.py | 108 +
 test/cinn/test_utils.py | 152 +
 test/cpp/cinn/CMakeLists.txt | 22 +
 test/cpp/cinn/benchmark/CMakeLists.txt | 11 +
 .../cinn/benchmark/test_all_ops_default.cc | 368 ++
 test/cpp/cinn/benchmark/test_elementwise.cc | 51 +
 test/cpp/cinn/benchmark/test_elementwise.h | 54 +
 test/cpp/cinn/benchmark/test_matmul.cc | 305 ++
 test/cpp/cinn/benchmark/test_matmul.h | 122 +
 test/cpp/cinn/benchmark/test_utils.cc | 232 ++
 test/cpp/cinn/benchmark/test_utils.h | 96 +
 test/cpp/cinn/concrete_program_builder.h | 100 +
 test/cpp/cinn/program_builder.cc | 63 +
 test/cpp/cinn/program_builder.h | 97 +
 test/cpp/cinn/test01_elementwise_add_case.cc | 162 +
 test/cpp/cinn/test01_elementwise_add_main.cc | 145 +
 test/cpp/cinn/test02_helper.h | 308 ++
 test/cpp/cinn/test02_matmul_case.cc | 222 ++
 test/cpp/cinn/test02_matmul_main.cc | 333 ++
 test/cpp/cinn/test03_convolution_case.cc | 27 +
 test/cpp/cinn/test03_convolution_main.cc | 70 +
 1056 files changed, 188306 insertions(+), 1 deletion(-)
 create mode 100644 paddle/cinn/CMakeLists.txt
 create mode 100644 paddle/cinn/auto_schedule/CMakeLists.txt
 create mode 100644 paddle/cinn/auto_schedule/analysis/CMakeLists.txt
 create mode 100644 paddle/cinn/auto_schedule/analysis/analyze_ir.cc
 create mode 100644 paddle/cinn/auto_schedule/analysis/analyze_ir.h
 create mode 100644 paddle/cinn/auto_schedule/analysis/analyze_ir_test.cc
 create mode 100644 paddle/cinn/auto_schedule/auto_schedule.proto
 create mode 100644 paddle/cinn/auto_schedule/auto_tuner.cc
 create mode 100644 paddle/cinn/auto_schedule/auto_tuner.h
 create mode 100644 paddle/cinn/auto_schedule/auto_tuner_test.cc
 create mode 100644 paddle/cinn/auto_schedule/cost_model/CMakeLists.txt
 create mode 100644 paddle/cinn/auto_schedule/cost_model/expr_cost_model.cc
 create mode 100644 paddle/cinn/auto_schedule/cost_model/expr_cost_model.h
 create mode 100644 paddle/cinn/auto_schedule/cost_model/feature.cc
 create mode 100644 paddle/cinn/auto_schedule/cost_model/feature.h
 create mode 100644 paddle/cinn/auto_schedule/cost_model/feature_extractor.cc
 create mode 100644 paddle/cinn/auto_schedule/cost_model/feature_extractor.h
 create mode 100644 paddle/cinn/auto_schedule/cost_model/feature_extractor_test.cc
 create mode 100644 paddle/cinn/auto_schedule/cost_model/feature_test.cc
 create mode 100644 paddle/cinn/auto_schedule/cost_model/xgb_cost_model.cc
 create mode 100644 paddle/cinn/auto_schedule/cost_model/xgb_cost_model.h
 create mode 100644 paddle/cinn/auto_schedule/cost_model/xgb_cost_model_test.cc
 create mode 100644 paddle/cinn/auto_schedule/database/CMakeLists.txt
 create mode 100644 paddle/cinn/auto_schedule/database/database.cc
 create mode 100644 paddle/cinn/auto_schedule/database/database.h
 create mode 100644 paddle/cinn/auto_schedule/database/database_test.cc
 create mode 100644 paddle/cinn/auto_schedule/database/jsonfile_database.cc
 create mode 100644 paddle/cinn/auto_schedule/database/jsonfile_database.h
 create mode 100644 paddle/cinn/auto_schedule/database/jsonfile_database_test.cc
 create mode 100644 paddle/cinn/auto_schedule/measure/CMakeLists.txt
 create mode 100644 paddle/cinn/auto_schedule/measure/measure.h
 create mode 100644 paddle/cinn/auto_schedule/measure/measurer_test.cc
 create mode 100644 paddle/cinn/auto_schedule/measure/schedule_measurer.cc
 create mode 100644 paddle/cinn/auto_schedule/measure/schedule_measurer.h
 create mode 100644 paddle/cinn/auto_schedule/measure/simple_builder.cc
 create mode 100644 paddle/cinn/auto_schedule/measure/simple_builder.h
 create mode 100644 paddle/cinn/auto_schedule/measure/simple_runner.cc
 create mode 100644 paddle/cinn/auto_schedule/measure/simple_runner.h
 create mode 100644 paddle/cinn/auto_schedule/measure/simple_runner_test.cc
 create mode 100644 paddle/cinn/auto_schedule/post_schedule_rule/CMakeLists.txt
 create mode 100644 paddle/cinn/auto_schedule/post_schedule_rule/cooperative_process.cc
 create mode 100644 paddle/cinn/auto_schedule/post_schedule_rule/cooperative_process.h
 create mode 100644 paddle/cinn/auto_schedule/post_schedule_rule/cooperative_process_test.cc
 create mode 100644 paddle/cinn/auto_schedule/post_schedule_rule/post_schedule_rule.h
 create mode 100644 paddle/cinn/auto_schedule/search_space/CMakeLists.txt
 create mode 100644 paddle/cinn/auto_schedule/search_space/auto_gen_rule/CMakeLists.txt
 create mode 100644 paddle/cinn/auto_schedule/search_space/auto_gen_rule/auto_bind.cc
 create mode 100644 paddle/cinn/auto_schedule/search_space/auto_gen_rule/auto_bind.h
 create mode 100644 paddle/cinn/auto_schedule/search_space/auto_gen_rule/auto_bind_test.cc
 create mode 100644 paddle/cinn/auto_schedule/search_space/auto_gen_rule/auto_gen_rule.cc
 create mode 100644 paddle/cinn/auto_schedule/search_space/auto_gen_rule/auto_gen_rule.h
 create mode 100644 paddle/cinn/auto_schedule/search_space/auto_gen_rule/auto_inline.cc
 create mode 100644 paddle/cinn/auto_schedule/search_space/auto_gen_rule/auto_inline.h
 create mode 100644 paddle/cinn/auto_schedule/search_space/auto_gen_rule/auto_inline_test.cc
 create mode 100644 paddle/cinn/auto_schedule/search_space/auto_gen_rule/auto_unroll.cc
 create mode 100644 paddle/cinn/auto_schedule/search_space/auto_gen_rule/auto_unroll.h
 create mode 100644 paddle/cinn/auto_schedule/search_space/auto_gen_rule/auto_unroll_test.cc
 create mode 100644 paddle/cinn/auto_schedule/search_space/auto_gen_rule/mix_rules_test.cc
 create mode 100644 paddle/cinn/auto_schedule/search_space/auto_gen_rule/multi_level_tiling.cc
 create mode 100644 paddle/cinn/auto_schedule/search_space/auto_gen_rule/multi_level_tiling.h
 create mode 100644 paddle/cinn/auto_schedule/search_space/auto_gen_rule/multi_level_tiling_test.cc
 create mode 100644 paddle/cinn/auto_schedule/search_space/auto_gen_rule/skip_rule.cc
 create mode 100644 paddle/cinn/auto_schedule/search_space/auto_gen_rule/skip_rule.h
 create mode 100644 paddle/cinn/auto_schedule/search_space/auto_gen_rule/skip_rule_test.cc
 create mode 100644 paddle/cinn/auto_schedule/search_space/auto_gen_rule/test_helper.cc
 create mode 100644 paddle/cinn/auto_schedule/search_space/auto_gen_rule/test_helper.h
 create mode 100644 paddle/cinn/auto_schedule/search_space/block_sampler.cc
 create mode 100644 paddle/cinn/auto_schedule/search_space/block_sampler.h
 create mode 100644 paddle/cinn/auto_schedule/search_space/block_sampler_test.cc
 create mode 100644 paddle/cinn/auto_schedule/search_space/rule_sampler.cc
 create mode 100644 paddle/cinn/auto_schedule/search_space/rule_sampler.h
 create mode 100644 paddle/cinn/auto_schedule/search_space/rule_sampler_test.cc
 create mode 100644 paddle/cinn/auto_schedule/search_space/search_space.cc
 create mode 100644 paddle/cinn/auto_schedule/search_space/search_space.h
 create mode 100644 paddle/cinn/auto_schedule/search_space/search_space_test.cc
 create mode 100644 paddle/cinn/auto_schedule/search_space/search_state.cc
 create mode 100644 paddle/cinn/auto_schedule/search_space/search_state.h
 create mode 100644 paddle/cinn/auto_schedule/search_space/search_state_test.cc
 create mode 100644 paddle/cinn/auto_schedule/search_strategy/CMakeLists.txt
 create mode 100644 paddle/cinn/auto_schedule/search_strategy/evolutionary_search.cc
 create mode 100644 paddle/cinn/auto_schedule/search_strategy/evolutionary_search.h
 create mode 100644 paddle/cinn/auto_schedule/search_strategy/evolutionary_search_test.cc
 create mode 100644 paddle/cinn/auto_schedule/search_strategy/mutate_rule/CMakeLists.txt
 create mode 100644 paddle/cinn/auto_schedule/search_strategy/mutate_rule/mutate_rule.cc
 create mode 100644 paddle/cinn/auto_schedule/search_strategy/mutate_rule/mutate_rule.h
 create mode 100644 paddle/cinn/auto_schedule/search_strategy/mutate_rule/mutate_tile_size.cc
 create mode 100644 paddle/cinn/auto_schedule/search_strategy/mutate_rule/mutate_tile_size.h
 create mode 100644 paddle/cinn/auto_schedule/search_strategy/mutate_rule/mutate_tile_size_test.cc
 create mode 100644 paddle/cinn/auto_schedule/task/CMakeLists.txt
 create mode 100644 paddle/cinn/auto_schedule/task/task_creator.cc
 create mode 100644 paddle/cinn/auto_schedule/task/task_creator.h
 create mode 100644 paddle/cinn/auto_schedule/task/task_creator_test.cc
 create mode 100644 paddle/cinn/auto_schedule/task/task_optimizer.cc
 create mode 100644 paddle/cinn/auto_schedule/task/task_optimizer.h
 create mode 100644 paddle/cinn/auto_schedule/task/task_registry.h
 create mode 100644 paddle/cinn/auto_schedule/task/task_registry_test.cc
 create mode 100644 paddle/cinn/auto_schedule/task/tune_task.cc
 create mode 100644 paddle/cinn/auto_schedule/task/tune_task.h
 create mode 100755 paddle/cinn/auto_schedule/task/tune_task_test.cc
 create mode 100644 paddle/cinn/auto_schedule/task_scheduler/CMakeLists.txt
 create mode 100644 paddle/cinn/auto_schedule/task_scheduler/efficiency_priority.cc
 create mode 100644 paddle/cinn/auto_schedule/task_scheduler/efficiency_priority.h
 create mode 100644 paddle/cinn/auto_schedule/task_scheduler/round_robin.cc
 create mode 100644 paddle/cinn/auto_schedule/task_scheduler/round_robin.h
 create mode 100644 paddle/cinn/auto_schedule/task_scheduler/task_scheduler.cc
 create mode 100644 paddle/cinn/auto_schedule/task_scheduler/task_scheduler.h
 create mode 100644 paddle/cinn/auto_schedule/task_scheduler/task_scheduler_test.cc
 create mode 100644 paddle/cinn/auto_schedule/tests/CMakeLists.txt
 create mode 100644 paddle/cinn/auto_schedule/tests/performance_comparison_test.cc
 create mode 100644 paddle/cinn/auto_schedule/tuning.h
 create mode 100755 paddle/cinn/backends/CMakeLists.txt
 create mode 100644 paddle/cinn/backends/_x86_builtin_source.cc
 create mode 100644 paddle/cinn/backends/codegen_c.cc
 create mode 100755 paddle/cinn/backends/codegen_c.h
 create mode 100755 paddle/cinn/backends/codegen_c_test.cc
 create mode 100644 paddle/cinn/backends/codegen_c_x86.cc
 create mode 100644 paddle/cinn/backends/codegen_c_x86.h
 create mode 100644 paddle/cinn/backends/codegen_c_x86_test.cc
 create mode 100644 paddle/cinn/backends/codegen_cuda_dev.cc
 create mode 100644 paddle/cinn/backends/codegen_cuda_dev.h
 create mode 100644 paddle/cinn/backends/codegen_cuda_generate_test.cc
 create mode 100644 paddle/cinn/backends/codegen_cuda_host.cc
 create mode 100644 paddle/cinn/backends/codegen_cuda_host.h
 create mode 100644 paddle/cinn/backends/codegen_cuda_util.cc
 create mode 100755 paddle/cinn/backends/codegen_cuda_util.h
 create mode 100644 paddle/cinn/backends/codegen_debug_test.cc
 create mode 100644 paddle/cinn/backends/compiler.cc
 create mode 100644 paddle/cinn/backends/compiler.h
 create mode 100644 paddle/cinn/backends/compiler_test.cc
 create mode 100644 paddle/cinn/backends/cuda_util.cc
 create mode 100644 paddle/cinn/backends/cuda_util.h
 create mode 100644 paddle/cinn/backends/extern_func_emitter.cc
 create mode 100644 paddle/cinn/backends/extern_func_emitter.h
 create mode 100644 paddle/cinn/backends/extern_func_emitter_builtin.cc
 create mode 100644 paddle/cinn/backends/extern_func_emitter_builtin.h
 create mode 100644 paddle/cinn/backends/extern_func_jit_register.cc
 create mode 100644 paddle/cinn/backends/extern_func_jit_register.h
 create mode 100644 paddle/cinn/backends/extern_func_protos.cc
 create mode 100644 paddle/cinn/backends/extern_func_protos.h
 create mode 100644 paddle/cinn/backends/function_prototype.cc
 create mode 100644 paddle/cinn/backends/function_prototype.h
 create mode 100644 paddle/cinn/backends/generated1.cu
 create mode 100644 paddle/cinn/backends/generated_module1.cc
 create mode 100644 paddle/cinn/backends/ir_schedule_test.cc
 create mode 100755 paddle/cinn/backends/llvm/CMakeLists.txt
 create mode 100644 paddle/cinn/backends/llvm/codegen_llvm.cc
 create mode 100644 paddle/cinn/backends/llvm/codegen_llvm.h
 create mode 100644 paddle/cinn/backends/llvm/codegen_llvm_test.cc
 create mode 100644 paddle/cinn/backends/llvm/codegen_x86.cc
 create mode 100644 paddle/cinn/backends/llvm/codegen_x86.h
 create mode 100644 paddle/cinn/backends/llvm/codegen_x86_test.cc
 create mode 100644 paddle/cinn/backends/llvm/execution_engine.cc
 create mode 100644 paddle/cinn/backends/llvm/execution_engine.h
 create mode 100644 paddle/cinn/backends/llvm/execution_engine_test.cc
 create mode 100644 paddle/cinn/backends/llvm/generate_runtime_llvm_ir.py
 create mode 100644 paddle/cinn/backends/llvm/ir_builder_mixin.h
 create mode 100644 paddle/cinn/backends/llvm/llvm_intrin_rule.h
 create mode 100644 paddle/cinn/backends/llvm/llvm_optimizer.cc
 create mode 100644 paddle/cinn/backends/llvm/llvm_optimizer.h
 create mode 100644 paddle/cinn/backends/llvm/llvm_util.cc
 create mode 100644 paddle/cinn/backends/llvm/llvm_util.h
 create mode 100644 paddle/cinn/backends/llvm/runtime_symbol_registry.cc
 create mode 100644 paddle/cinn/backends/llvm/runtime_symbol_registry.h
 create mode 100755 paddle/cinn/backends/llvm/simple_jit.cc
 create mode 100755 paddle/cinn/backends/llvm/simple_jit.h
 create mode 100644 paddle/cinn/backends/modular.cc
 create mode 100644 paddle/cinn/backends/modular.h
 create mode 100644 paddle/cinn/backends/nvrtc/CMakeLists.txt
 create mode 100644 paddle/cinn/backends/nvrtc/header_generator.cc
 create mode 100644 paddle/cinn/backends/nvrtc/header_generator.h
 create mode 100644 paddle/cinn/backends/nvrtc/nvrtc_util.cc
 create mode 100644 paddle/cinn/backends/nvrtc/nvrtc_util.h
 create mode 100644 paddle/cinn/backends/nvrtc/nvrtc_util_test.cc
 create mode 100644 paddle/cinn/backends/outputs.cc
 create mode 100644 paddle/cinn/backends/outputs.h
 create mode 100644 paddle/cinn/backends/raw_cuda_code_test.cu
 create mode 100644 paddle/cinn/cinn.h
 create mode 100644 paddle/cinn/common/CMakeLists.txt
 create mode 100644 paddle/cinn/common/arithmatic.cc
 create mode 100644 paddle/cinn/common/arithmatic.h
 create mode 100644 paddle/cinn/common/arithmatic_test.cc
 create mode 100644 paddle/cinn/common/axis.cc
 create mode 100644 paddle/cinn/common/axis.h
 create mode 100644 paddle/cinn/common/axis_test.cc
 create mode 100644 paddle/cinn/common/bfloat16.h
 create mode 100644 paddle/cinn/common/cas.cc
 create mode 100755 paddle/cinn/common/cas.h
 create mode 100644 paddle/cinn/common/cas_test.cc
 create mode 100644 paddle/cinn/common/cinn_value.cc
 create mode 100755 paddle/cinn/common/cinn_value.h
 create mode 100644 paddle/cinn/common/cinn_value_test.cc
 create mode 100644 paddle/cinn/common/common.h
 create mode 100644 paddle/cinn/common/context.cc
 create mode 100644 paddle/cinn/common/context.h
 create mode 100644 paddle/cinn/common/cost_model.h
 create mode 100644 paddle/cinn/common/cuda_test_helper.cc
 create mode 100644 paddle/cinn/common/cuda_test_helper.h
 create mode 100644 paddle/cinn/common/debug_manager.cc
 create mode 100644 paddle/cinn/common/debug_manager.h
 create mode 100644 paddle/cinn/common/float16.h
 create mode 100644 paddle/cinn/common/float16_bfloat16_cuda_test.cu
 create mode 100644 paddle/cinn/common/float16_bfloat16_host_test.cc
 create mode 100644 paddle/cinn/common/float16_bfloat16_utils.h
 create mode 100755 paddle/cinn/common/graph_utils.cc
 create mode 100644 paddle/cinn/common/graph_utils.h
 create mode 100644 paddle/cinn/common/graph_utils_test.cc
 create mode 100644 paddle/cinn/common/info_registry.cc
 create mode 100644 paddle/cinn/common/info_registry.h
 create mode 100755 paddle/cinn/common/ir_util.cc
 create mode 100644 paddle/cinn/common/ir_util.h
 create mode 100644 paddle/cinn/common/macros.h
 create mode 100644 paddle/cinn/common/object.cc
 create mode 100644 paddle/cinn/common/object.h
 create mode 100644 paddle/cinn/common/python_interpreter_guard.cc
 create mode 100644 paddle/cinn/common/python_interpreter_guard.h
 create mode 100644 paddle/cinn/common/shared.cc
 create mode 100644 paddle/cinn/common/shared.h
 create mode 100644 paddle/cinn/common/shared_test.cc
 create mode 100644 paddle/cinn/common/target.cc
 create mode 100755 paddle/cinn/common/target.h
 create mode 100644 paddle/cinn/common/test_helper.cc
 create mode 100644 paddle/cinn/common/test_helper.h
 create mode 100644 paddle/cinn/common/type.cc
 create mode 100644 paddle/cinn/common/type.h
 create mode 100644 paddle/cinn/common/type_test.cc
 create mode 100644 paddle/cinn/common/union_find.cc
 create mode 100644 paddle/cinn/common/union_find.h
 create mode 100755 paddle/cinn/frontend/CMakeLists.txt
 create mode 100644 paddle/cinn/frontend/computation.cc
 create mode 100644 paddle/cinn/frontend/computation.h
 create mode 100644 paddle/cinn/frontend/computation_test.cc
 create mode 100755 paddle/cinn/frontend/decomposer/CMakeLists.txt
 create mode 100644 paddle/cinn/frontend/decomposer/activation.cc
 create mode 100644 paddle/cinn/frontend/decomposer/activation_test.cc
 create mode 100644 paddle/cinn/frontend/decomposer/batch_norm.cc
 create mode 100755 paddle/cinn/frontend/decomposer/batch_norm_test.cc
 create mode 100644 paddle/cinn/frontend/decomposer/broadcast.cc
 create mode 100644 paddle/cinn/frontend/decomposer/broadcast_test.cc
 create mode 100644 paddle/cinn/frontend/decomposer/elementwise.cc
 create mode 100644 paddle/cinn/frontend/decomposer/elementwise_test.cc
 create mode 100644 paddle/cinn/frontend/decomposer/test_helper.cc
 create mode 100644 paddle/cinn/frontend/decomposer/test_helper.h
 create mode 100644 paddle/cinn/frontend/decomposer/top_k.cc
 create mode 100644 paddle/cinn/frontend/decomposer/top_k_test.cc
 create mode 100644 paddle/cinn/frontend/decomposer/use_decomposer.h
 create mode 100644 paddle/cinn/frontend/decomposer_registry.h
 create mode 100644 paddle/cinn/frontend/decomposer_registry_test.cc
 create mode 100755 paddle/cinn/frontend/interpreter.cc
 create mode 100755 paddle/cinn/frontend/interpreter.h
 create mode 100755 paddle/cinn/frontend/interpreter_test.cc
 create mode 100644 paddle/cinn/frontend/net_builder.cc
 create mode 100644 paddle/cinn/frontend/net_builder.h
 create mode 100644 paddle/cinn/frontend/net_builder_test.cc
 create mode 100644 paddle/cinn/frontend/op_mapper_registry.cc
 create mode 100644 paddle/cinn/frontend/op_mapper_registry.h
 create mode 100644 paddle/cinn/frontend/op_mapper_registry_test.cc
 create mode 100644 paddle/cinn/frontend/op_mappers/CMakeLists.txt
 create mode 100644 paddle/cinn/frontend/op_mappers/common_utils.h
 create mode 100644 paddle/cinn/frontend/op_mappers/paddle/CMakeLists.txt
 create mode 100644 paddle/cinn/frontend/op_mappers/paddle/arg_min_max.cc
 create mode 100644 paddle/cinn/frontend/op_mappers/paddle/argsort.cc
 create mode 100644 paddle/cinn/frontend/op_mappers/paddle/atan.cc
 create mode 100644 paddle/cinn/frontend/op_mappers/paddle/batchnorm.cc
 create mode 100644 paddle/cinn/frontend/op_mappers/paddle/binary.cc
 create mode 100644 paddle/cinn/frontend/op_mappers/paddle/cholesky.cc
 create mode 100644 paddle/cinn/frontend/op_mappers/paddle/clip.cc
 create mode 100644 paddle/cinn/frontend/op_mappers/paddle/compare.cc
 create mode 100644 paddle/cinn/frontend/op_mappers/paddle/concat.cc
 create mode 100644 paddle/cinn/frontend/op_mappers/paddle/constant.cc
 create mode 100644 paddle/cinn/frontend/op_mappers/paddle/conv2d.cc
 create mode 100644 paddle/cinn/frontend/op_mappers/paddle/cumsum.cc
 create mode 100644 paddle/cinn/frontend/op_mappers/paddle/dropout.cc
 create mode 100644 paddle/cinn/frontend/op_mappers/paddle/elementwise.cc
 create mode 100644 paddle/cinn/frontend/op_mappers/paddle/expand.cc
 create mode 100644 paddle/cinn/frontend/op_mappers/paddle/fetch_feed.cc
 create mode 100644 paddle/cinn/frontend/op_mappers/paddle/flip.cc
 create mode 100644 paddle/cinn/frontend/op_mappers/paddle/gather.cc
 create mode 100644 paddle/cinn/frontend/op_mappers/paddle/gather_nd.cc
 create mode 100644 paddle/cinn/frontend/op_mappers/paddle/gaussian_random.cc
 create mode 100644 paddle/cinn/frontend/op_mappers/paddle/layer_norm.cc
 create mode 100644 paddle/cinn/frontend/op_mappers/paddle/log.cc
 create mode 100644 paddle/cinn/frontend/op_mappers/paddle/lookup_table.cc
 create mode 100644 paddle/cinn/frontend/op_mappers/paddle/matmul.cc
 create mode 100644 paddle/cinn/frontend/op_mappers/paddle/mul.cc
 create mode 100644 paddle/cinn/frontend/op_mappers/paddle/norm.cc
 create mode 100644 paddle/cinn/frontend/op_mappers/paddle/one_hot.cc
 create mode 100644 paddle/cinn/frontend/op_mappers/paddle/pool2d.cc
 create mode 100644 paddle/cinn/frontend/op_mappers/paddle/randint.cc
 create mode 100644 paddle/cinn/frontend/op_mappers/paddle/reduce.cc
 create mode 100644 paddle/cinn/frontend/op_mappers/paddle/relu.cc
 create mode 100644 paddle/cinn/frontend/op_mappers/paddle/reshape.cc
 create mode 100644 paddle/cinn/frontend/op_mappers/paddle/reverse.cc
 create mode 100644 paddle/cinn/frontend/op_mappers/paddle/roll.cc
 create mode 100644 paddle/cinn/frontend/op_mappers/paddle/scale.cc
 create mode 100644 paddle/cinn/frontend/op_mappers/paddle/scatter.cc
 create mode 100644 paddle/cinn/frontend/op_mappers/paddle/slice.cc
 create mode 100644 paddle/cinn/frontend/op_mappers/paddle/softmax.cc
 create mode 100644 paddle/cinn/frontend/op_mappers/paddle/squeeze.cc
 create mode 100644 paddle/cinn/frontend/op_mappers/paddle/strided_slice.cc
 create mode 100644 paddle/cinn/frontend/op_mappers/paddle/take_along_axis.cc
 create mode 100644 paddle/cinn/frontend/op_mappers/paddle/tile.cc
 create mode 100644 paddle/cinn/frontend/op_mappers/paddle/top_k.cc
 create mode 100644 paddle/cinn/frontend/op_mappers/paddle/transpose.cc
 create mode 100644 paddle/cinn/frontend/op_mappers/paddle/triangular_solve.cc
 create mode 100644 paddle/cinn/frontend/op_mappers/paddle/unary.cc
 create mode 100644 paddle/cinn/frontend/op_mappers/paddle/uniform_random.cc
 create mode 100644 paddle/cinn/frontend/op_mappers/paddle/unsqueeze.cc
 create mode 100644 paddle/cinn/frontend/op_mappers/paddle/where.cc
 create mode 100644 paddle/cinn/frontend/op_mappers/science/CMakeLists.txt
 create mode 100644 paddle/cinn/frontend/op_mappers/science/broadcast.cc
 create mode 100644 paddle/cinn/frontend/op_mappers/science/compare.cc
 create mode 100644 paddle/cinn/frontend/op_mappers/science/math.cc
 create mode 100644 paddle/cinn/frontend/op_mappers/science/transform.cc
 create mode 100644 paddle/cinn/frontend/op_mappers/use_op_mappers.h
 create mode 100644 paddle/cinn/frontend/optimize.cc
 create mode 100755 paddle/cinn/frontend/optimize.h
 create mode 100644 paddle/cinn/frontend/paddle/CMakeLists.txt
 create mode 100644 paddle/cinn/frontend/paddle/README.md
 create mode 100644 paddle/cinn/frontend/paddle/compatible_pb.cc
 create mode 100644 paddle/cinn/frontend/paddle/compatible_pb.h
 create mode 100644 paddle/cinn/frontend/paddle/cpp/CMakeLists.txt
 create mode 100644 paddle/cinn/frontend/paddle/cpp/block_desc.cc
 create mode 100644 paddle/cinn/frontend/paddle/cpp/block_desc.h
 create mode 100644 paddle/cinn/frontend/paddle/cpp/desc_api.h
 create mode 100644 paddle/cinn/frontend/paddle/cpp/op_desc.cc
 create mode 100644 paddle/cinn/frontend/paddle/cpp/op_desc.h
 create mode 100644 paddle/cinn/frontend/paddle/cpp/program_desc.cc
 create mode 100644 paddle/cinn/frontend/paddle/cpp/program_desc.h
 create mode 100644 paddle/cinn/frontend/paddle/cpp/var_desc.cc
 create mode 100644 paddle/cinn/frontend/paddle/cpp/var_desc.h
 create mode 100644 paddle/cinn/frontend/paddle/framework.proto
 create mode 100755 paddle/cinn/frontend/paddle/model_parser.cc
 create mode 100644 paddle/cinn/frontend/paddle/model_parser.h
 create mode 100644 paddle/cinn/frontend/paddle/model_parser_test.cc
 create mode 100644 paddle/cinn/frontend/paddle/pb/CMakeLists.txt
 create mode 100644 paddle/cinn/frontend/paddle/pb/block_desc.cc
 create mode 100644 paddle/cinn/frontend/paddle/pb/block_desc.h
 create mode 100644 paddle/cinn/frontend/paddle/pb/op_desc.cc
 create mode 100644 paddle/cinn/frontend/paddle/pb/op_desc.h
 create mode 100644 paddle/cinn/frontend/paddle/pb/program_desc.cc
 create mode 100644 paddle/cinn/frontend/paddle/pb/program_desc.h
 create mode 100644 paddle/cinn/frontend/paddle/pb/var_desc.cc
 create mode 100644 paddle/cinn/frontend/paddle/pb/var_desc.h
 create mode 100644 paddle/cinn/frontend/paddle_model_convertor.cc
 create mode 100644 paddle/cinn/frontend/paddle_model_convertor.h
 create mode 100644 paddle/cinn/frontend/paddle_model_convertor_test.cc
 create mode 100644 paddle/cinn/frontend/paddle_model_to_program.cc
 create mode 100644 paddle/cinn/frontend/paddle_model_to_program.h
 create mode 100755 paddle/cinn/frontend/pass/CMakeLists.txt
 create mode 100644 paddle/cinn/frontend/pass/auto_broadcast.cc
 create mode 100644 paddle/cinn/frontend/pass/auto_cast.cc
 create mode 100644 paddle/cinn/frontend/pass/auto_cast_test.cc
 create mode 100644 paddle/cinn/frontend/pass/cast_collapsing.cc
 create mode 100644 paddle/cinn/frontend/pass/cast_collapsing_test.cc
 create mode 100644 paddle/cinn/frontend/pass/dead_code_eliminate.cc
 create mode 100644 paddle/cinn/frontend/pass/dead_code_eliminate_test.cc
 create mode 100755 paddle/cinn/frontend/pass/decomposer.cc
 create mode 100644 paddle/cinn/frontend/pass/decomposer_test.cc
 create mode 100644 paddle/cinn/frontend/pass/expand_zero_dim_pass.cc
 create mode 100644 paddle/cinn/frontend/pass/expand_zero_dim_pass_test.cc
 create mode 100644 paddle/cinn/frontend/pass/fill_constant_folding.cc
 create mode 100644 paddle/cinn/frontend/pass/fill_constant_folding_test.cc
 create mode 100644 paddle/cinn/frontend/pass/fill_constant_rewriter.cc
 create mode 100644 paddle/cinn/frontend/pass/fill_constant_rewriter_test.cc
 create mode 100644 paddle/cinn/frontend/pass/gemm_rewriter.cc
 create mode 100755 paddle/cinn/frontend/pass/gemm_rewriter_test.cc
 create mode 100644 paddle/cinn/frontend/pass/pass_test_helper.h
 create mode 100644 paddle/cinn/frontend/pass/program_topoerror_test.cc
 create mode 100644 paddle/cinn/frontend/pass/remove_identity.cc
 create mode 100644 paddle/cinn/frontend/pass/remove_identity_test.cc
 create mode 100644 paddle/cinn/frontend/pass/test_helper.h
 create mode 100644 paddle/cinn/frontend/pass/transpose_collapsing.cc
 create mode 100644 paddle/cinn/frontend/pass/transpose_collapsing_test.cc
 create mode 100644 paddle/cinn/frontend/pass/transpose_folding_base.h
 create mode 100644 paddle/cinn/frontend/pass/transpose_folding_input.cc
 create mode 100644 paddle/cinn/frontend/pass/transpose_folding_input_test.cc
 create mode 100644 paddle/cinn/frontend/pass/transpose_folding_output.cc
 create mode 100755 paddle/cinn/frontend/pass/transpose_folding_output_test.cc
 create mode 100644 paddle/cinn/frontend/pass/transpose_scale_folding_test.cc
 create mode 100644 paddle/cinn/frontend/pass/use_program_pass.h
 create mode 100644 paddle/cinn/frontend/program_pass.cc
 create mode 100755 paddle/cinn/frontend/program_pass.h
 create mode 100644 paddle/cinn/frontend/syntax.cc
 create mode 100644 paddle/cinn/frontend/syntax.h
 create mode 100644 paddle/cinn/frontend/syntax_test.cc
 create mode 100644 paddle/cinn/frontend/var_type_utils.h
 create mode 100644 paddle/cinn/gtest_main.cc
 create mode 100644 paddle/cinn/hlir/CMakeLists.txt
 create mode 100755 paddle/cinn/hlir/framework/CMakeLists.txt
 create mode 100644 paddle/cinn/hlir/framework/accuracy_checker.cc
 create mode 100644 paddle/cinn/hlir/framework/accuracy_checker.h
 create mode 100644 paddle/cinn/hlir/framework/accuracy_checker_test.cc
 create mode 100755 paddle/cinn/hlir/framework/buffer.cc
 create mode 100644 paddle/cinn/hlir/framework/buffer.h
 create mode 100755 paddle/cinn/hlir/framework/buffer_test.cc
 create mode 100644 paddle/cinn/hlir/framework/graph.cc
 create mode 100644 paddle/cinn/hlir/framework/graph.h
 create mode 100644 paddle/cinn/hlir/framework/graph_compiler.cc
 create mode 100644 paddle/cinn/hlir/framework/graph_compiler.h
 create mode 100644 paddle/cinn/hlir/framework/graph_compiler_test.cc
 create mode 100644 paddle/cinn/hlir/framework/graph_test.cc
 create mode 100644 paddle/cinn/hlir/framework/instruction.cc
 create mode 100644 paddle/cinn/hlir/framework/instruction.h
 create mode 100644 paddle/cinn/hlir/framework/instruction_test.cc
 create mode 100755 paddle/cinn/hlir/framework/memory.cc
 create mode 100755 paddle/cinn/hlir/framework/memory.h
 create mode 100644 paddle/cinn/hlir/framework/node.cc
 create mode 100644 paddle/cinn/hlir/framework/node.h
 create mode 100755 paddle/cinn/hlir/framework/op.h
 create mode 100644 paddle/cinn/hlir/framework/op_lowering.cc
 create mode 100755 paddle/cinn/hlir/framework/op_lowering.h
 create mode 100644 paddle/cinn/hlir/framework/op_lowering_test.cc
 create mode 100644 paddle/cinn/hlir/framework/op_lowering_util.cc
 create mode 100644 paddle/cinn/hlir/framework/op_lowering_util.h
 create mode 100644 paddle/cinn/hlir/framework/op_strategy.cc
 create mode 100644 paddle/cinn/hlir/framework/op_strategy.h
 create mode 100644 paddle/cinn/hlir/framework/op_test.cc
 create mode 100644 paddle/cinn/hlir/framework/parallel_compiler.cc
 create mode 100644 paddle/cinn/hlir/framework/parallel_compiler.h
 create mode 100644 paddle/cinn/hlir/framework/parallel_compiler_test.cc
 create mode 100644 paddle/cinn/hlir/framework/pass.cc
 create mode 100644 paddle/cinn/hlir/framework/pass.h
 create mode 100644 paddle/cinn/hlir/framework/print_graph_pass_test.cc
 create mode 100644 paddle/cinn/hlir/framework/schedule.h
 create mode 100755 paddle/cinn/hlir/framework/scope.cc
 create mode 100755 paddle/cinn/hlir/framework/scope.h
 create mode 100644 paddle/cinn/hlir/framework/scope_test.cc
 create mode 100644 paddle/cinn/hlir/framework/tensor.cc
 create mode 100644 paddle/cinn/hlir/framework/tensor.h
 create mode 100644 paddle/cinn/hlir/framework/tensor_test.cc
 create mode 100644 paddle/cinn/hlir/framework/variable.cc
 create mode 100644 paddle/cinn/hlir/framework/variable.h
 create mode 100644 paddle/cinn/hlir/framework/visualize_helper.cc
 create mode 100644 paddle/cinn/hlir/framework/visualize_helper.h
 create mode 100644 paddle/cinn/hlir/kernels/CMakeLists.txt
 create mode 100644 paddle/cinn/hlir/op/CMakeLists.txt
 create mode 100644 paddle/cinn/hlir/op/broadcast.cc
 create mode 100644 paddle/cinn/hlir/op/contrib/CMakeLists.txt
 create mode 100644 paddle/cinn/hlir/op/contrib/argmax.cc
 create mode 100644 paddle/cinn/hlir/op/contrib/argmax.h
 create mode 100644 paddle/cinn/hlir/op/contrib/argmax_test.cc
 create mode 100644 paddle/cinn/hlir/op/contrib/argmin.cc
 create mode 100644 paddle/cinn/hlir/op/contrib/argmin.h
 create mode 100644 paddle/cinn/hlir/op/contrib/argmin_test.cc
 create mode 100644 paddle/cinn/hlir/op/contrib/assert_true.cc
 create mode 100644 paddle/cinn/hlir/op/contrib/bitcast_convert.cc
 create mode 100644 paddle/cinn/hlir/op/contrib/cholesky.cc
 create mode 100644 paddle/cinn/hlir/op/contrib/gather_nd.cc
 create mode 100644 paddle/cinn/hlir/op/contrib/gather_nd.h
 create mode 100644 paddle/cinn/hlir/op/contrib/gather_nd_test.cc
 create mode 100644 paddle/cinn/hlir/op/contrib/gaussian_random.cc
 create mode 100644 paddle/cinn/hlir/op/contrib/logical_right_shift.cc
 create mode 100644 paddle/cinn/hlir/op/contrib/logical_right_shift.h
 create mode 100644 paddle/cinn/hlir/op/contrib/logical_right_shift_test.cc
 create mode 100644 paddle/cinn/hlir/op/contrib/lookup_table.cc
 create mode 100644 paddle/cinn/hlir/op/contrib/lookup_table.h
 create mode 100644 paddle/cinn/hlir/op/contrib/lookup_table_test.cc
 create mode 100755 paddle/cinn/hlir/op/contrib/one_hot.cc
 create mode 100644 paddle/cinn/hlir/op/contrib/one_hot.h
 create mode 100644 paddle/cinn/hlir/op/contrib/one_hot_test.cc
 create mode 100644 paddle/cinn/hlir/op/contrib/randint.cc
 create mode 100644 paddle/cinn/hlir/op/contrib/reciprocal.cc
 create mode 100644 paddle/cinn/hlir/op/contrib/reciprocal.h
 create mode 100644 paddle/cinn/hlir/op/contrib/reciprocal_test.cc
 create mode 100755 paddle/cinn/hlir/op/contrib/repeat.cc
 create mode 100644 paddle/cinn/hlir/op/contrib/repeat.h
 create mode 100755 paddle/cinn/hlir/op/contrib/repeat_test.cc
 create mode 100644 paddle/cinn/hlir/op/contrib/resize.cc
 create mode 100644 paddle/cinn/hlir/op/contrib/resize.h
 create mode 100644 paddle/cinn/hlir/op/contrib/sort.cc
 create mode 100644 paddle/cinn/hlir/op/contrib/sort.h
 create mode 100644 paddle/cinn/hlir/op/contrib/sort_test.cc
 create mode 100644 paddle/cinn/hlir/op/contrib/triangular_solve.cc
 create mode 100644 paddle/cinn/hlir/op/contrib/uniform_random.cc
 create mode 100644 paddle/cinn/hlir/op/custom_call.cc
 create mode 100644 paddle/cinn/hlir/op/elementwise.cc
 create mode 100644 paddle/cinn/hlir/op/external_api_registry.cc
 create mode 100644 paddle/cinn/hlir/op/external_api_registry.h
 create mode 100644 paddle/cinn/hlir/op/external_api_registry_test.cc
 create mode 100644 paddle/cinn/hlir/op/nn.cc
 create mode 100755 paddle/cinn/hlir/op/op_broadcast_test.cc
 create mode 100644 paddle/cinn/hlir/op/op_nn_test.cc
 create mode 100644 paddle/cinn/hlir/op/op_util.cc
 create mode 100644 paddle/cinn/hlir/op/op_util.h
 create mode 100644 paddle/cinn/hlir/op/reduction.cc
 create mode 100644 paddle/cinn/hlir/op/reduction_test.cc
 create mode 100644 paddle/cinn/hlir/op/transform.cc
 create mode 100644 paddle/cinn/hlir/op/transform_test.cc
 create mode 100644 paddle/cinn/hlir/op/use_ops.h
 create mode 100644 paddle/cinn/hlir/pass/CMakeLists.txt
 create mode 100644 paddle/cinn/hlir/pass/alterlayout.cc
 create mode 100755 paddle/cinn/hlir/pass/alterlayout_test.cc
 create mode 100644 paddle/cinn/hlir/pass/check_fusion_accuracy_pass.cc
 create mode 100644 paddle/cinn/hlir/pass/check_fusion_accuracy_pass_test.cc
 create mode 100644 paddle/cinn/hlir/pass/common_subexpression_elimination.cc
 create mode 100644 paddle/cinn/hlir/pass/common_subexpression_elimination_test.cc
 create mode 100644 paddle/cinn/hlir/pass/const_propagate.cc
 create mode 100644 paddle/cinn/hlir/pass/const_propagate_test.cc
 create mode 100644 paddle/cinn/hlir/pass/constant_folding_pass.cc
 create mode 100644 paddle/cinn/hlir/pass/constant_folding_pass_test.cc
 create mode 100644 paddle/cinn/hlir/pass/constant_folding_pass_util.cc
 create mode 100644 paddle/cinn/hlir/pass/constant_folding_pass_util.h
 create mode 100644 paddle/cinn/hlir/pass/custom_call_pass.cc
 create mode 100644 paddle/cinn/hlir/pass/dce_pass.cc
 create mode 100644 paddle/cinn/hlir/pass/dce_pass_test.cc
 create mode 100644 paddle/cinn/hlir/pass/dense_merge_pass.cc
 create mode 100644 paddle/cinn/hlir/pass/dense_merge_pass_test.cc
 create mode 100644 paddle/cinn/hlir/pass/dot_merger.cc
 create mode 100644 paddle/cinn/hlir/pass/dot_merger_test.cc
 create mode 100644 paddle/cinn/hlir/pass/fusion_helper_base.h
 create mode 100644 paddle/cinn/hlir/pass/fusion_merge_pass.cc
 create mode 100755 paddle/cinn/hlir/pass/fusion_merge_pass_test.cc
 create mode 100644 paddle/cinn/hlir/pass/fusion_merge_pass_util.h
 create mode 100755 paddle/cinn/hlir/pass/infershape.cc
 create mode 100644 paddle/cinn/hlir/pass/infershape.h
 create mode 100644 paddle/cinn/hlir/pass/op_fusion_pass.cc
 create mode 100755 paddle/cinn/hlir/pass/op_fusion_pass_test.cc
 create mode 100644 paddle/cinn/hlir/pass/op_fusion_pass_util.h
 create mode 100644 paddle/cinn/hlir/pass/opfusion.cc
 create mode 100755 paddle/cinn/hlir/pass/opfusion_test.cc
 create mode 100644 paddle/cinn/hlir/pass/reduce_split_pass.cc
 create mode 100644 paddle/cinn/hlir/pass/reduce_split_pass_test.cc
 create mode 100644 paddle/cinn/hlir/pass/single_group_optimize_pass.cc
 create mode 100644 paddle/cinn/hlir/pass/test_dot_merger.cc
 create mode 100755 paddle/cinn/hlir/pass/test_primitive_ops.cc
 create mode 100644 paddle/cinn/hlir/pass/use_pass.h
 create mode 100755 paddle/cinn/hlir/pe/CMakeLists.txt
 create mode 100644 paddle/cinn/hlir/pe/broadcast.cc
 create mode 100644 paddle/cinn/hlir/pe/broadcast.h
 create mode 100644 paddle/cinn/hlir/pe/elementwise.cc
 create mode 100644 paddle/cinn/hlir/pe/elementwise.h
 create mode 100644 paddle/cinn/hlir/pe/ir_schedule_pe.cc
 create mode 100644 paddle/cinn/hlir/pe/ir_schedule_pe.h
 create mode 100644 paddle/cinn/hlir/pe/load_params_test.cc
 create mode 100644 paddle/cinn/hlir/pe/load_x86_params.cc
 create mode 100644 paddle/cinn/hlir/pe/load_x86_params.h
 create mode 100644 paddle/cinn/hlir/pe/nn.cc
 create mode 100755 paddle/cinn/hlir/pe/nn.h
 create mode 100644 paddle/cinn/hlir/pe/nn_util.cc
 create mode 100644 paddle/cinn/hlir/pe/nn_util.h
 create mode 100644 paddle/cinn/hlir/pe/pe_broadcast_test.cc
 create mode 100644 paddle/cinn/hlir/pe/pe_elementwise_test.cc
 create mode 100644 paddle/cinn/hlir/pe/pe_transform_test.cc
 create mode 100644 paddle/cinn/hlir/pe/reduction.cc
 create mode 100644 paddle/cinn/hlir/pe/reduction.h
 create mode 100644 paddle/cinn/hlir/pe/schedule.cc
 create mode 100644 paddle/cinn/hlir/pe/schedule.h
 create mode 100644 paddle/cinn/hlir/pe/schedule_param.proto
 create mode 100644 paddle/cinn/hlir/pe/transform.cc
 create mode 100644 paddle/cinn/hlir/pe/transform.h
 create mode 100644 paddle/cinn/hlir/pe/vision.cc
 create mode 100644 paddle/cinn/hlir/pe/vision.h
 create mode 100755 paddle/cinn/ir/CMakeLists.txt
 create mode 100755 paddle/cinn/ir/buffer.cc
 create mode 100755 paddle/cinn/ir/buffer.h
 create mode 100644 paddle/cinn/ir/buffer_test.cc
 create mode 100644 paddle/cinn/ir/collect_ir_nodes.cc
 create mode 100755 paddle/cinn/ir/collect_ir_nodes.h
 create mode 100644 paddle/cinn/ir/collect_ir_nodes_test.cc
 create mode 100644 paddle/cinn/ir/function_base.cc
 create mode 100644 paddle/cinn/ir/function_base.h
 create mode 100644 paddle/cinn/ir/function_definition.cc
 create mode 100644 paddle/cinn/ir/function_definition.h
 create mode 100644 paddle/cinn/ir/intrinsic_ops.cc
 create mode 100644 paddle/cinn/ir/intrinsic_ops.h
 create mode 100644 paddle/cinn/ir/intrinsic_ops_test.cc
 create mode 100755 paddle/cinn/ir/ir.cc
 create mode 100644 paddle/cinn/ir/ir.h
 create mode 100644 paddle/cinn/ir/ir_base.cc
 create mode 100644 paddle/cinn/ir/ir_base.h
 create mode 100644 paddle/cinn/ir/ir_compare.cc
 create mode 100644 paddle/cinn/ir/ir_compare.h
 create mode 100644 paddle/cinn/ir/ir_compare_test.cc
 create mode 100644 paddle/cinn/ir/ir_mutator.cc
 create mode 100755 paddle/cinn/ir/ir_mutator.h
 create mode 100644 paddle/cinn/ir/ir_operators.cc
 create mode 100644 paddle/cinn/ir/ir_operators.h
 create mode 100644 paddle/cinn/ir/ir_operators_test.cc
 create mode 100644 paddle/cinn/ir/ir_printer.cc
 create mode 100644 paddle/cinn/ir/ir_printer.h
 create mode 100644 paddle/cinn/ir/ir_printer_test.cc
 create mode 100644 paddle/cinn/ir/ir_schedule.cc
 create mode 100644 paddle/cinn/ir/ir_schedule.h
 create mode 100644 paddle/cinn/ir/ir_schedule_util.cc
 create mode 100644 paddle/cinn/ir/ir_schedule_util.h
 create mode 100644 paddle/cinn/ir/ir_test.cc
 create mode 100644 paddle/cinn/ir/ir_verify.cc
 create mode 100644 paddle/cinn/ir/ir_verify.h
 create mode 100644 paddle/cinn/ir/ir_verify_test.cc
 create mode 100644 paddle/cinn/ir/ir_visitor.cc
 create mode 100644 paddle/cinn/ir/ir_visitor.h
 create mode 100644 paddle/cinn/ir/layout.cc
 create mode 100644 paddle/cinn/ir/layout.h
 create mode 100644 paddle/cinn/ir/lowered_func.cc
 create mode 100755 paddle/cinn/ir/lowered_func.h
 create mode 100644 paddle/cinn/ir/module.cc
 create mode 100644 paddle/cinn/ir/module.h
 create mode 100644 paddle/cinn/ir/operation.cc
 create mode 100644 paddle/cinn/ir/operation.h
 create mode 100644 paddle/cinn/ir/registry.cc
 create mode 100644 paddle/cinn/ir/registry.h
 create mode 100644 paddle/cinn/ir/schedule_desc.cc
 create mode 100644 paddle/cinn/ir/schedule_desc.h
 create mode 100644 paddle/cinn/ir/schedule_desc.proto
 create mode 100644 paddle/cinn/ir/schedule_desc_test.cc
 create mode 100755 paddle/cinn/ir/tensor.cc
 create mode 100644 paddle/cinn/ir/tensor.h
 create mode 100755 paddle/cinn/ir/tensor_test.cc
 create mode 100644 paddle/cinn/lang/CMakeLists.txt
 create mode 100644 paddle/cinn/lang/README.md
 create mode 100644 paddle/cinn/lang/buffer.cc
 create mode 100644 paddle/cinn/lang/buffer.h
 create mode 100644 paddle/cinn/lang/builtin.cc
 create mode 100644 paddle/cinn/lang/builtin.h
 create mode 100644 paddle/cinn/lang/compute.cc
 create mode 100755 paddle/cinn/lang/compute.h
 create mode 100644 paddle/cinn/lang/compute_test.cc
 create mode 100755 paddle/cinn/lang/lower.cc
 create mode 100644 paddle/cinn/lang/lower.h
 create mode 100644 paddle/cinn/lang/lower_impl.cc
 create mode 100644 paddle/cinn/lang/lower_impl.h
 create mode 100644 paddle/cinn/lang/lower_impl_test.cc
 create mode 100755 paddle/cinn/lang/lower_test.cc
 create mode 100644 paddle/cinn/lang/packed_func.cc
 create mode 100644 paddle/cinn/lang/packed_func.h
 create mode 100644 paddle/cinn/lang/packed_func_test.cc
 create mode 100644 paddle/cinn/lang/placeholder.cc
 create mode 100644 paddle/cinn/lang/placeholder.h
 create mode 100644 paddle/cinn/lang/placeholder_test.cc
 create mode 100755 paddle/cinn/optim/CMakeLists.txt
 create mode 100644 paddle/cinn/optim/buffer_assign.cc
 create mode 100644 paddle/cinn/optim/buffer_assign.h
 create mode 100755 paddle/cinn/optim/cache_read_write_replace_test.cc
 create mode 100644 paddle/cinn/optim/call_arg_list_to_pod_value.cc
 create mode 100644 paddle/cinn/optim/call_arg_list_to_pod_value.h
 create mode 100644 paddle/cinn/optim/cast_bool_to_int8.cc
 create mode 100644 paddle/cinn/optim/cast_bool_to_int8.h
 create mode 100644 paddle/cinn/optim/cast_simplify.cc
 create mode 100644 paddle/cinn/optim/cast_simplify.h
 create mode 100644 paddle/cinn/optim/cast_simplify_test.cc
 create mode 100644 paddle/cinn/optim/collect_undefined_vars.cc
 create mode 100644 paddle/cinn/optim/collect_undefined_vars.h
 create mode 100644 paddle/cinn/optim/compute_inline_expand.cc
 create mode 100644 paddle/cinn/optim/compute_inline_expand.h
 create mode 100644 paddle/cinn/optim/eliminate_broadcast_in_forloop.cc
 create mode 100644 paddle/cinn/optim/eliminate_broadcast_in_forloop.h
 create mode 100644 paddle/cinn/optim/extern_call_process.cc
 create mode 100644 paddle/cinn/optim/extern_call_process.h
 create mode 100644 paddle/cinn/optim/fold_cinn_call_arguments.cc
 create mode 100644 paddle/cinn/optim/fold_cinn_call_arguments.h
 create mode 100644 paddle/cinn/optim/if_simplify.cc
 create mode 100644 paddle/cinn/optim/if_simplify.h
 create mode 100644 paddle/cinn/optim/if_simplify_test.cc
 create mode 100644 paddle/cinn/optim/insert_debug_log_callee.cc
 create mode 100644 paddle/cinn/optim/insert_debug_log_callee.h
 create mode 100644 paddle/cinn/optim/ir_copy.cc
 create mode 100644 paddle/cinn/optim/ir_copy.h
 create mode 100644 paddle/cinn/optim/ir_copy_test.cc
 create mode 100755 paddle/cinn/optim/ir_replace.cc
 create mode 100644 paddle/cinn/optim/ir_replace.h
 create mode 100644 paddle/cinn/optim/ir_simplify.cc
 create mode 100644 paddle/cinn/optim/ir_simplify.h
 create mode 100755 paddle/cinn/optim/ir_simplify_test.cc
 create mode 100644 paddle/cinn/optim/lower_function_call_bind_vars.cc
 create mode 100644 paddle/cinn/optim/lower_function_call_bind_vars.h
 create mode 100644 paddle/cinn/optim/lower_intrin.cc
 create mode 100644 paddle/cinn/optim/lower_intrin.h
 create mode 100644 paddle/cinn/optim/map_extern_call.cc
 create mode 100644 paddle/cinn/optim/map_extern_call.h
 create mode 100644 paddle/cinn/optim/optimize.cc
 create mode 100644 paddle/cinn/optim/optimize.h
 create mode 100755 paddle/cinn/optim/optimize_test.cc
 create mode 100644 paddle/cinn/optim/remove_nested_block.cc
 create mode 100644 paddle/cinn/optim/remove_nested_block.h
 create mode 100644 paddle/cinn/optim/remove_nested_block_test.cc
 create mode 100644 paddle/cinn/optim/remove_schedule_block.cc
 create mode 100644 paddle/cinn/optim/remove_schedule_block.h
 create mode 100755 paddle/cinn/optim/remove_schedule_block_test.cc
 create mode 100644 paddle/cinn/optim/replace_call_with_expr.cc
 create mode 100644 paddle/cinn/optim/replace_call_with_expr.h
 create mode 100644 paddle/cinn/optim/replace_call_with_expr_test.cc
 create mode 100644 paddle/cinn/optim/replace_const_param_to_integer.cc
 create mode 100644 paddle/cinn/optim/replace_const_param_to_integer.h
 create mode 100644 paddle/cinn/optim/replace_var_with_expr.cc
 create mode 100644 paddle/cinn/optim/replace_var_with_expr.h
 create mode 100644 paddle/cinn/optim/tensor_write_tell.cc
 create mode 100644 paddle/cinn/optim/tensor_write_tell.h
 create mode 100644 paddle/cinn/optim/transform_gpu_forloop.cc
 create mode 100644 paddle/cinn/optim/transform_gpu_forloop.h
 create mode 100644 paddle/cinn/optim/transform_polyfor_to_for.cc
 create mode 100644 paddle/cinn/optim/transform_polyfor_to_for.h
 create mode 100644 paddle/cinn/optim/transform_polyfor_to_for_test.cc
 create mode 100755 paddle/cinn/optim/unroll_loops.cc
 create mode 100644 paddle/cinn/optim/unroll_loops.h
 create mode 100644 paddle/cinn/optim/unroll_loops_test.cc
 create mode 100644 paddle/cinn/optim/var_mod_simplify.cc
 create mode 100644 paddle/cinn/optim/var_mod_simplify.h
 create mode 100644 paddle/cinn/optim/vectorize_loops.cc
 create mode 100644 paddle/cinn/optim/vectorize_loops.h
 create mode 100644 paddle/cinn/optim/vectorize_loops_test.cc
 create mode 100644 paddle/cinn/poly/CMakeLists.txt
 create mode 100644 paddle/cinn/poly/ast_gen.cc
 create mode 100644 paddle/cinn/poly/ast_gen.h
 create mode 100644 paddle/cinn/poly/ast_gen_test.cc
 create mode 100755 paddle/cinn/poly/compute_at_transform.cc
 create mode 100644 paddle/cinn/poly/compute_at_transform.h
 create mode 100644 paddle/cinn/poly/compute_at_transform_test.cc
 create mode 100644 paddle/cinn/poly/dim.cc
 create mode 100644 paddle/cinn/poly/dim.h
 create mode 100644 paddle/cinn/poly/domain.cc
 create mode 100644 paddle/cinn/poly/domain.h
 create mode 100644 paddle/cinn/poly/domain_add_unit_loop_mutator.cc
 create mode 100644 paddle/cinn/poly/domain_add_unit_loop_mutator.h
 create mode 100755 paddle/cinn/poly/graph.cc
 create mode 100644 paddle/cinn/poly/graph.h
 create mode 100644 paddle/cinn/poly/graph_test.cc
 create mode 100644 paddle/cinn/poly/isl_utils.cc
 create mode 100644 paddle/cinn/poly/isl_utils.h
 create mode 100644 paddle/cinn/poly/isl_utils_test.cc
 create mode 100644 paddle/cinn/poly/map.cc
 create mode 100644 paddle/cinn/poly/map.h
 create mode 100644 paddle/cinn/poly/naive_scheduler.cc
 create mode 100644 paddle/cinn/poly/naive_scheduler.h
 create mode 100755 paddle/cinn/poly/poly_scheduler.cc
 create mode 100644 paddle/cinn/poly/poly_scheduler.h
 create mode 100644 paddle/cinn/poly/poly_scheduler_test.cc
 create mode 100644 paddle/cinn/poly/schedule.cc
 create mode 100755 paddle/cinn/poly/schedule.h
 create mode 100755 paddle/cinn/poly/schedule_test.cc
 create mode 100644 paddle/cinn/poly/stage.cc
 create mode 100755 paddle/cinn/poly/stage.h
 create mode 100755 paddle/cinn/poly/stage_test.cc
 create mode 100755 paddle/cinn/pybind/CMakeLists.txt
 create mode 100644 paddle/cinn/pybind/backends.cc
 create mode 100644 paddle/cinn/pybind/bind.cc
 create mode 100644 paddle/cinn/pybind/bind.h
 create mode 100644 paddle/cinn/pybind/bind_utils.h
 create mode 100644 paddle/cinn/pybind/common.cc
 create mode 100755 paddle/cinn/pybind/framework.cc
 create mode 100644 paddle/cinn/pybind/frontend.cc
 create mode 100755 paddle/cinn/pybind/ir.cc
 create mode 100644 paddle/cinn/pybind/lang.cc
 create mode 100755 paddle/cinn/pybind/optim.cc
 create mode 100644 paddle/cinn/pybind/pe.cc
 create mode 100644 paddle/cinn/pybind/poly.cc
 create mode 100644 paddle/cinn/pybind/runtime.cc
 create mode 100644 paddle/cinn/pybind/utils.cc
 create mode 100644 paddle/cinn/runtime/CMakeLists.txt
 create mode 100755 paddle/cinn/runtime/buffer.cc
 create mode 100755 paddle/cinn/runtime/buffer.h
 create mode 100644 paddle/cinn/runtime/cinn_runtime.cc
 create mode 100755 paddle/cinn/runtime/cinn_runtime.h
 create mode 100644 paddle/cinn/runtime/cinn_runtime_test.cc
 create mode 100644 paddle/cinn/runtime/cinn_x86_device_impl.cc
 create mode 100644 paddle/cinn/runtime/cpu/CMakeLists.txt
 create mode 100644 paddle/cinn/runtime/cpu/cblas.cc
 create mode 100644 paddle/cinn/runtime/cpu/cblas.h
 create mode 100644 paddle/cinn/runtime/cpu/host_intrinsics.cc
 create mode 100644 paddle/cinn/runtime/cpu/host_intrinsics.h
 create mode 100644 paddle/cinn/runtime/cpu/host_intrinsics_test.cc
 create mode 100644 paddle/cinn/runtime/cpu/mkl_math.cc
 create mode 100644 paddle/cinn/runtime/cpu/mkl_math.h
 create mode 100644 paddle/cinn/runtime/cpu/mkl_math_test.cc
 create mode 100644 paddle/cinn/runtime/cpu/mkldnn_math.cc
 create mode 100644 paddle/cinn/runtime/cpu/mkldnn_math.h
 create mode 100644 paddle/cinn/runtime/cpu/mkldnn_math_test.cc
 create mode 100644 paddle/cinn/runtime/cpu/thread_backend.cc
 create mode 100644 paddle/cinn/runtime/cpu/thread_backend.h
 create mode 100644 paddle/cinn/runtime/cpu/use_extern_funcs.h
 create mode 100755 paddle/cinn/runtime/cuda/CMakeLists.txt
 create mode 100644 paddle/cinn/runtime/cuda/bfloat16.h
 create mode 100644 paddle/cinn/runtime/cuda/cinn_cuda_runtime_source.cuh
 create mode 100644 paddle/cinn/runtime/cuda/cublas_util.h
 create mode 100644 paddle/cinn/runtime/cuda/cuda_instrinsics_bfloat16.cc
 create mode 100644 paddle/cinn/runtime/cuda/cuda_instrinsics_float16.cc
 create mode 100644 paddle/cinn/runtime/cuda/cuda_intrinsics.cc
 create mode 100644 paddle/cinn/runtime/cuda/cuda_intrinsics_reduce.cc
 create mode 100644 paddle/cinn/runtime/cuda/cuda_module.cc
 create mode 100644 paddle/cinn/runtime/cuda/cuda_module.h
 create mode 100644 paddle/cinn/runtime/cuda/cuda_module_test.cc
 create mode 100644 paddle/cinn/runtime/cuda/cuda_util.cc
 create mode 100644 paddle/cinn/runtime/cuda/cuda_util.h
 create mode 100644 paddle/cinn/runtime/cuda/float16.h
 create mode 100644 paddle/cinn/runtime/cuda/test_util.h
 create mode 100644 paddle/cinn/runtime/cuda/use_extern_funcs.h
 create mode 100644 paddle/cinn/runtime/custom_function.cc
 create mode 100644 paddle/cinn/runtime/custom_function.h
 create mode 100644 paddle/cinn/runtime/custom_function_test.cc
 create mode 100644 paddle/cinn/runtime/flags.cc
 create mode 100644 paddle/cinn/runtime/flags.h
 create mode 100644 paddle/cinn/runtime/intrinsic.cc
 create mode 100644 paddle/cinn/runtime/intrinsic.h
 create mode 100644 paddle/cinn/runtime/intrinsic_types.cc
 create mode 100644 paddle/cinn/runtime/intrinsic_types.h
 create mode 100644 paddle/cinn/runtime/tiny_runtime.cc
 create mode 100644 paddle/cinn/runtime/use_extern_funcs.h
 create mode 100755 paddle/cinn/utils/CMakeLists.txt
 create mode 100644 paddle/cinn/utils/data_util.cc
 create mode 100644 paddle/cinn/utils/data_util.h
 create mode 100644 paddle/cinn/utils/dot_lang.cc
 create mode 100644 paddle/cinn/utils/dot_lang.h
 create mode 100644 paddle/cinn/utils/error.cc
 create mode 100644 paddle/cinn/utils/error.h
 create mode 100644 paddle/cinn/utils/event.cc
 create mode 100644 paddle/cinn/utils/event.h
 create mode 100644 paddle/cinn/utils/functional.cc
 create mode 100644 paddle/cinn/utils/functional.h
 create mode 100644 paddle/cinn/utils/functional_test.cc
 create mode 100644 paddle/cinn/utils/multi_threading.cc
 create mode 100644 paddle/cinn/utils/multi_threading.h
 create mode 100644 paddle/cinn/utils/multi_threading_test.cc
 create mode 100644 paddle/cinn/utils/profiler.cc
 create mode 100644 paddle/cinn/utils/profiler.h
 create mode 100644 paddle/cinn/utils/profiler_test.cc
 create mode 100644 paddle/cinn/utils/random_engine.cc
 create mode 100644 paddle/cinn/utils/random_engine.h
 create mode 100644 paddle/cinn/utils/registry.h
 create mode 100644 paddle/cinn/utils/sized_multi_set.cc
 create mode 100644 paddle/cinn/utils/sized_multi_set.h
 create mode 100644 paddle/cinn/utils/sized_multi_set_test.cc
 create mode 100644 paddle/cinn/utils/small_vector.cc
 create mode 100644 paddle/cinn/utils/small_vector.h
 create mode 100644 paddle/cinn/utils/string.cc
 create mode 100644 paddle/cinn/utils/string.h
 create mode 100644 paddle/cinn/utils/string_test.cc
 create mode 100644 paddle/cinn/utils/timer.cc
 create mode 100644 paddle/cinn/utils/timer.h
 create mode 100644 paddle/cinn/utils/type_defs.h
 create mode 100644 python/cinn/__init__.py
 create mode 100644 python/cinn/auto_schedule/__init__.py
 create mode 100644 python/cinn/auto_schedule/cost_model/__init__.py
 create mode 100644 python/cinn/auto_schedule/cost_model/cost_model.py
 create mode 100644 python/cinn/auto_schedule/cost_model/xgb_cost_model.py
 create mode 100644 python/cinn/backends.py
 create mode 100644 python/cinn/common.py
 create mode 100644 python/cinn/framework.py
 create mode 100644 python/cinn/frontend.py
 create mode 100644 python/cinn/ir/__init__.py
 create mode 100644 python/cinn/lang.py
 create mode 100644 python/cinn/libs/__init__.py
 create mode 100644 python/cinn/optim.py
 create mode 100644 python/cinn/pe.py
 create mode 100644 python/cinn/poly.py
 create mode 100644 python/cinn/runtime.py
 create mode 100644 python/cinn/utils.py
 create mode 100644 python/cinn/version/__init__.py
 create mode 100644 test/cinn/auto_schedule/cost_model/test_cost_model.py
 create mode 100644 test/cinn/conv2d_utils.py
 create mode 100644 test/cinn/fake_model/naive_mul.py
 create mode 100644 test/cinn/fake_model/naive_multi_fc.py
 create mode 100644 test/cinn/fake_model/resnet_model.py
 create mode 100644 test/cinn/fusion/fusion_test.py
 create mode 100644 test/cinn/fusion/test_cast_broadcast_reduce_max.py
 create mode 100644 test/cinn/fusion/test_reduce_cast.py
 create mode 100644 test/cinn/fusion/test_select_reduce.py
 create mode 100644 test/cinn/op_mappers/op_mapper_test.py
 create mode 100644 test/cinn/op_mappers/test_argmax_op.py
 create mode 100644 test/cinn/op_mappers/test_argmin_op.py
 create mode 100644 test/cinn/op_mappers/test_argsort_op.py
 create mode 100644 test/cinn/op_mappers/test_assign_value_op.py
 create mode 100644 test/cinn/op_mappers/test_atan2_op.py
 create mode 100644 test/cinn/op_mappers/test_batch_norm_op.py
 create mode 100644 test/cinn/op_mappers/test_bitwise_op.py
 create mode 100644 test/cinn/op_mappers/test_cholesky_op.py
 create mode 100644 test/cinn/op_mappers/test_clip_op.py
 create mode 100644 test/cinn/op_mappers/test_compare_op.py
 create mode 100644 test/cinn/op_mappers/test_conv2d_op.py
 create mode 100644 test/cinn/op_mappers/test_cumsum_op.py
 create mode 100644 test/cinn/op_mappers/test_elementwise_op.py
 create mode 100644 test/cinn/op_mappers/test_expand_op.py
 create mode 100644 test/cinn/op_mappers/test_expand_v2_op.py
 create mode 100644 test/cinn/op_mappers/test_fill_constant_op.py
 create mode 100644 test/cinn/op_mappers/test_flip_op.py
 create mode 100644 test/cinn/op_mappers/test_gather_nd_op.py
 create mode 100644 test/cinn/op_mappers/test_gather_op.py
 create mode 100644 test/cinn/op_mappers/test_gaussian_random_op.py
 create mode 100644 test/cinn/op_mappers/test_layer_norm_op.py
 create mode 100644 test/cinn/op_mappers/test_log1p_op.py
 create mode 100644 test/cinn/op_mappers/test_logical_op.py
 create mode 100644 test/cinn/op_mappers/test_lookup_table_op.py
 create mode 100644 test/cinn/op_mappers/test_mul_op.py
 create mode 100644 test/cinn/op_mappers/test_norm_op.py
 create mode 100644 test/cinn/op_mappers/test_one_hot_op.py
 create mode 100644 test/cinn/op_mappers/test_pool2d_op.py
 create mode 100644 test/cinn/op_mappers/test_pow_op.py
 create mode 100644 test/cinn/op_mappers/test_randint_op.py
 create mode 100644 test/cinn/op_mappers/test_reduce_op.py
 create mode 100644 test/cinn/op_mappers/test_reverse_op.py
 create mode 100644 test/cinn/op_mappers/test_roll_op.py
 create mode 100644 test/cinn/op_mappers/test_scale_op.py
 create mode 100644 test/cinn/op_mappers/test_scatter_op.py
 create mode 100644 test/cinn/op_mappers/test_sign_op.py
 create mode 100644 test/cinn/op_mappers/test_split_op.py
 create mode 100644 test/cinn/op_mappers/test_squeeze_op.py
 create mode 100644 test/cinn/op_mappers/test_stack_op.py
 create mode 100644 test/cinn/op_mappers/test_strided_slice_op.py
 create mode 100644 test/cinn/op_mappers/test_take_along_axis_op.py
 create mode 100644 test/cinn/op_mappers/test_tile_op.py
 create mode 100644 test/cinn/op_mappers/test_transpose2_op.py
 create mode 100644 test/cinn/op_mappers/test_triangular_solve_op.py
 create mode 100644 test/cinn/op_mappers/test_unary_op.py
 create mode 100644 test/cinn/op_mappers/test_uniform_random_op.py
 create mode 100644 test/cinn/op_mappers/test_where_op.py
 create mode 100755 test/cinn/ops/op_test.py
 create mode 100644 test/cinn/ops/op_test_helper.py
 create mode 100644 test/cinn/ops/test_abs_op.py
 create mode 100644 test/cinn/ops/test_acos_op.py
 create mode 100644 test/cinn/ops/test_add_op.py
 create mode 100644 test/cinn/ops/test_arange_op.py
 create mode 100644 test/cinn/ops/test_argsort_op.py
 create mode 100644 test/cinn/ops/test_asin_op.py
 create mode 100644 test/cinn/ops/test_asinh_op.py
 create mode 100644 test/cinn/ops/test_atan2_op.py
 create mode 100644 test/cinn/ops/test_atan_op.py
 create mode 100644 test/cinn/ops/test_atanh_op.py
 create mode 100644 test/cinn/ops/test_batch_norm_op.py
 create mode 100644 test/cinn/ops/test_binary_elementwise_op.py
 create mode 100644 test/cinn/ops/test_bitcast_convert_op.py
 create mode 100644 test/cinn/ops/test_bitwise_op.py
 create mode 100644 test/cinn/ops/test_broadcast_to_op.py
 create mode 100644 test/cinn/ops/test_broadcast_to_op_new.py
 create mode 100644 test/cinn/ops/test_cast_op.py
 create mode 100644 test/cinn/ops/test_cbrt_op.py
 create mode 100644 test/cinn/ops/test_ceil_op.py
 create mode 100644 test/cinn/ops/test_cholesky_op.py
 create mode 100644 test/cinn/ops/test_clz_op.py
 create mode 100644 test/cinn/ops/test_comparison_op.py
 create mode 100755 test/cinn/ops/test_concat_op.py
 create mode 100644 test/cinn/ops/test_constant_op.py
 create mode 100755 test/cinn/ops/test_conv2d_op.py
 create mode 100644 test/cinn/ops/test_cos_op.py
 create mode 100644 test/cinn/ops/test_cosh_op.py
 create mode 100644 test/cinn/ops/test_depthwise_conv2d_op.py
 create mode 100644 test/cinn/ops/test_divide_op.py
 create mode 100644 test/cinn/ops/test_dropout_infer_op.py
 create mode 100644 test/cinn/ops/test_erf_op.py
 create mode 100644 test/cinn/ops/test_exp_op.py
 create mode 100644 test/cinn/ops/test_expand_dims.py
 create mode 100644 test/cinn/ops/test_fill_constant_op.py
 create mode 100644 test/cinn/ops/test_floor_divide_op.py
 create mode 100644 test/cinn/ops/test_floor_op.py
 create mode 100644 test/cinn/ops/test_gather_nd_op.py
 create mode 100644 test/cinn/ops/test_gather_op.py
 create mode 100644 test/cinn/ops/test_gaussian_random_op.py
 create mode 100644 test/cinn/ops/test_gelu_op.py
 create mode 100644 test/cinn/ops/test_identity_op.py
 create mode 100644 test/cinn/ops/test_is_finite_op.py
 create mode 100644 test/cinn/ops/test_is_inf_op.py
 create mode 100644 test/cinn/ops/test_is_nan_op.py
 create mode 100644 test/cinn/ops/test_isclose_op.py
 create mode 100644 test/cinn/ops/test_left_shift_op.py
 create mode 100644 test/cinn/ops/test_log_op.py
 create mode 100644 test/cinn/ops/test_logical_right_shift_op.py
 create mode 100644 test/cinn/ops/test_lookup_table_op.py
 create mode 100755 test/cinn/ops/test_matmul_op.py
 create mode 100644 test/cinn/ops/test_max_op.py
 create mode 100644 test/cinn/ops/test_mod_op.py
 create mode 100755 test/cinn/ops/test_mul_op.py
 create mode 100644 test/cinn/ops/test_multiply_op.py
 create mode 100644 test/cinn/ops/test_negative_op.py
 create mode 100755 test/cinn/ops/test_one_hot_op.py
 create mode 100644 test/cinn/ops/test_pool2d_op.py
 create mode 100644 test/cinn/ops/test_popc_op.py
 create mode 100644 test/cinn/ops/test_pow_op.py
 create mode 100644 test/cinn/ops/test_randint_op.py
 create mode 100644 test/cinn/ops/test_reciprocal_op.py
 create mode 100644 test/cinn/ops/test_reduce_op.py
 create mode 100644 test/cinn/ops/test_reduce_op_new.py
 create mode 100644 test/cinn/ops/test_reduce_op_other.py
 create mode 100644 test/cinn/ops/test_relu6_op.py
 create mode 100755 test/cinn/ops/test_relu_op.py
 create mode 100644 test/cinn/ops/test_remainder_op.py
 create mode 100644 test/cinn/ops/test_repeat_op.py
 create mode 100644 test/cinn/ops/test_reshape_op.py
 create mode 100644 test/cinn/ops/test_resize_op.py
 create mode 100755 test/cinn/ops/test_reverse_op.py
 create mode 100644 test/cinn/ops/test_right_shift_op.py
 create mode 100644 test/cinn/ops/test_round_op.py
 create mode 100644 test/cinn/ops/test_rsqrt_op.py
 create mode 100644 test/cinn/ops/test_scale_op.py
 create mode 100644 test/cinn/ops/test_scatter_add.py
 create mode 100644 test/cinn/ops/test_scatter_assign_op.py
 create mode 100644 test/cinn/ops/test_select_op.py
 create mode 100644 test/cinn/ops/test_sigmoid_op.py
 create mode 100644 test/cinn/ops/test_sign_op.py
 create mode 100644 test/cinn/ops/test_sin_op.py
 create mode 100644 test/cinn/ops/test_sinh_op.py
 create mode 100644 test/cinn/ops/test_slice_assign_op.py
 create mode 100644 test/cinn/ops/test_slice_op.py
 create mode 100644 test/cinn/ops/test_softmax_op.py
 create mode 100644 test/cinn/ops/test_sort_op.py
 create mode 100755 test/cinn/ops/test_split_op.py
 create mode 100644 test/cinn/ops/test_sqrt_op.py
 create mode 100644 test/cinn/ops/test_squeeze_op.py
 create mode 100644 test/cinn/ops/test_subtract_op.py
 create mode 100644 test/cinn/ops/test_sum_op.py
 create mode 100644 test/cinn/ops/test_tan_op.py
 create mode 100644 test/cinn/ops/test_tanh_op.py
 create mode 100644 test/cinn/ops/test_top_k_op.py
 create mode 100644 test/cinn/ops/test_transpose_op.py
 create mode 100644 test/cinn/ops/test_triangular_solve_op.py
 create mode 100644 test/cinn/ops/test_trunc_op.py
 create mode 100644 test/cinn/ops/test_unary_elementwise_op.py
 create mode 100644 test/cinn/ops/test_uniform_random_op.py
 create mode 100644 test/cinn/ops/test_zero_dim_tensor.py
 create mode 100644 test/cinn/passes/pass_test.py
 create mode 100644 test/cinn/passes/test_auto_cast_pass.py
 create mode 100644 test/cinn/passes/test_expand_zero_dim_pass.py
 create mode 100644 test/cinn/passes/test_transpose_floding_input_pass.py
 create mode 100644 test/cinn/passes/test_transpose_floding_output_pass.py
 create mode 100644 test/cinn/pool_utils.py
 create mode 100644 test/cinn/test_common.py
 create mode 100755 test/cinn/test_computation.py
 create mode 100755 test/cinn/test_efficientnet.py
 create mode 100755 test/cinn/test_facedet.py
 create mode 100755 test/cinn/test_frontend.py
 create mode 100644 test/cinn/test_hlir_framework.py
 create mode 100644 test/cinn/test_ir.py
 create mode 100755 test/cinn/test_matmul.py
 create mode 100644 test/cinn/test_mobilenetv1.py
 create mode 100755 test/cinn/test_mobilenetv2.py
 create mode 100755 test/cinn/test_netbuilder.py
 create mode 100755 test/cinn/test_op_benchmark.py
 create mode 100644 test/cinn/test_op_broadcast.py
 create mode 100644 test/cinn/test_op_nn.py
 create mode 100644 test/cinn/test_op_transform.py
 create mode 100755 test/cinn/test_packed_func.py
 create mode 100644 test/cinn/test_paddle_model_convertor.py
 create mode 100644 test/cinn/test_pe_elementwise.py
 create mode 100644 test/cinn/test_pe_reduction.py
 create mode 100644 test/cinn/test_pe_transform.py
 create mode 100755 test/cinn/test_resnet.py
 create mode 100755 test/cinn/test_resnet18.py
 create mode 100755 test/cinn/test_resnet50.py
 create mode 100644 test/cinn/test_squeezenet.py
 create mode 100755 test/cinn/test_utils.py
 create mode 100644 test/cpp/cinn/CMakeLists.txt
 create mode 100755 test/cpp/cinn/benchmark/CMakeLists.txt
 create mode 100644 test/cpp/cinn/benchmark/test_all_ops_default.cc
 create mode 100644 test/cpp/cinn/benchmark/test_elementwise.cc
 create mode 100644 test/cpp/cinn/benchmark/test_elementwise.h
 create mode 100644 test/cpp/cinn/benchmark/test_matmul.cc
 create mode 100644 test/cpp/cinn/benchmark/test_matmul.h
 create mode 100755 test/cpp/cinn/benchmark/test_utils.cc
 create mode 100755
test/cpp/cinn/benchmark/test_utils.h create mode 100644 test/cpp/cinn/concrete_program_builder.h create mode 100644 test/cpp/cinn/program_builder.cc create mode 100644 test/cpp/cinn/program_builder.h create mode 100644 test/cpp/cinn/test01_elementwise_add_case.cc create mode 100644 test/cpp/cinn/test01_elementwise_add_main.cc create mode 100644 test/cpp/cinn/test02_helper.h create mode 100644 test/cpp/cinn/test02_matmul_case.cc create mode 100644 test/cpp/cinn/test02_matmul_main.cc create mode 100644 test/cpp/cinn/test03_convolution_case.cc create mode 100755 test/cpp/cinn/test03_convolution_main.cc diff --git a/cmake/cinn/external/jitify.cmake b/cmake/cinn/external/jitify.cmake index b04d64b12b8fb..8ee57c13ece4c 100644 --- a/cmake/cinn/external/jitify.cmake +++ b/cmake/cinn/external/jitify.cmake @@ -12,7 +12,6 @@ ExternalProject_Add( ${EXTERNAL_PROJECT_LOG_ARGS} GIT_REPOSITORY "https://github.com/NVIDIA/jitify.git" GIT_TAG 57de649139c866eb83acacfe50c92ad7c6278776 - GIT_TAG master PREFIX ${CINN_THIRD_PARTY_PATH}/jitify SOURCE_DIR ${JITIFY_SOURCE_PATH} CONFIGURE_COMMAND "" diff --git a/paddle/cinn/CMakeLists.txt b/paddle/cinn/CMakeLists.txt new file mode 100644 index 0000000000000..16c70714d7f36 --- /dev/null +++ b/paddle/cinn/CMakeLists.txt @@ -0,0 +1,21 @@ +if (WITH_TESTING) + cc_library(cinn_gtest_main SRCS gtest_main.cc DEPS gtest) +endif() + +add_subdirectory(auto_schedule) +add_subdirectory(common) +add_subdirectory(utils) +add_subdirectory(poly) +add_subdirectory(runtime) +add_subdirectory(ir) +add_subdirectory(backends) +add_subdirectory(lang) +add_subdirectory(optim) +add_subdirectory(hlir) +add_subdirectory(pybind) +add_subdirectory(frontend) + +# Download a model +download_and_uncompress("${DOWNLOAD_MODEL_DIR}" "${PADDLE_RESOURCE_URL}" "lite_naive_model.tar.gz") + +core_gather_headers() diff --git a/paddle/cinn/auto_schedule/CMakeLists.txt b/paddle/cinn/auto_schedule/CMakeLists.txt new file mode 100644 index 0000000000000..7a2d725d33ee8 --- /dev/null +++ b/paddle/cinn/auto_schedule/CMakeLists.txt @@ -0,0 +1,22 @@ +add_subdirectory(analysis) +add_subdirectory(cost_model) +add_subdirectory(database) +add_subdirectory(measure) +add_subdirectory(post_schedule_rule) +add_subdirectory(search_space) +add_subdirectory(search_strategy) +add_subdirectory(task) +add_subdirectory(task_scheduler) +add_subdirectory(tests) + +proto_library(auto_schedule_proto SRCS auto_schedule.proto DEPS schedule_desc_proto) + +core_gather_headers() + +gather_srcs(cinnapi_src SRCS auto_tuner.cc) + +#cc_test(test_auto_tuner SRCS auto_tuner_test.cc DEPS cinncore) + +foreach(header ${auto_schedule_proto_HDRS}) + set(core_proto_includes "${core_proto_includes};${header}" CACHE INTERNAL "") +endforeach() diff --git a/paddle/cinn/auto_schedule/analysis/CMakeLists.txt b/paddle/cinn/auto_schedule/analysis/CMakeLists.txt new file mode 100644 index 0000000000000..46eda4a587bb8 --- /dev/null +++ b/paddle/cinn/auto_schedule/analysis/CMakeLists.txt @@ -0,0 +1,5 @@ +core_gather_headers() + +gather_srcs(cinnapi_src SRCS analyze_ir.cc) + +cc_test(test_analyze_ir SRCS analyze_ir_test.cc DEPS cinncore) diff --git a/paddle/cinn/auto_schedule/analysis/analyze_ir.cc b/paddle/cinn/auto_schedule/analysis/analyze_ir.cc new file mode 100644 index 0000000000000..21ff620118d59 --- /dev/null +++ b/paddle/cinn/auto_schedule/analysis/analyze_ir.cc @@ -0,0 +1,176 @@ +// Copyright (c) 2022 CINN Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "cinn/auto_schedule/analysis/analyze_ir.h" + +#include + +#include +#include +#include + +#include "cinn/ir/buffer.h" +#include "cinn/ir/collect_ir_nodes.h" +#include "cinn/ir/ir.h" +#include "cinn/ir/ir_base.h" +#include "cinn/ir/ir_printer.h" +#include "cinn/ir/ir_schedule.h" +#include "cinn/ir/lowered_func.h" +#include "cinn/ir/tensor.h" +#include "cinn/lang/lower.h" +#include "cinn/optim/ir_copy.h" +#include "cinn/optim/optimize.h" +#include "cinn/optim/transform_gpu_forloop.h" + +namespace cinn { +namespace auto_schedule { + +std::vector IndicesToVars(const std::vector& indices) { + std::vector result; + for (const ir::Expr& e : indices) { + // Whether we have to convert other types, like const numbers to Var? + if (e.As() != nullptr) { + ir::Expr copy_e = optim::IRCopy(e); + ir::_Var_* var_ref = copy_e.As(); + result.emplace_back(ir::Var(var_ref)); + } + } + return result; +} + +void AnalyzeScheduleBlockReadWriteBuffer(ir::ScheduleBlock* sche_block) { + if (!sche_block->read_buffers.empty() || !sche_block->write_buffers.empty()) { + return; + } + + ir::CollectIRNodesWithoutTensor(sche_block->body, [&](const Expr* x) { + const ir::Load* load_expr = x->As(); + if (load_expr != nullptr) { + const ir::Tensor t = load_expr->tensor.as_tensor_ref(); + sche_block->read_buffers.emplace_back(ir::BufferRange(t->buffer, IndicesToVars(load_expr->indices))); + return false; + } + const ir::Store* store_expr = x->As(); + if (store_expr != nullptr) { + const ir::Tensor t = store_expr->tensor.as_tensor_ref(); + sche_block->write_buffers.emplace_back(ir::BufferRange(t->buffer, IndicesToVars(store_expr->indices))); + return false; + } + return false; + }); +} + +bool ContainsNodeType(ir::Expr expr, const std::unordered_set& node_types) { + std::set collection = ir::CollectIRNodesWithoutTensor( + expr, [&](const Expr* x) { return node_types.find(x->node_type()) != node_types.end(); }); + return !collection.empty(); +} + +std::unordered_set GetOutputNamesFromLoweredFunc(const std::vector& lowered_funcs) { + std::unordered_set result; + for (const ir::LoweredFunc& func : lowered_funcs) { + for (const ir::Argument& arg : func->args) { + if (arg.is_output()) { + result.insert(arg.name()); + } + } + } + return result; +} + +bool NeedsMultiLevelTiling(const ir::ScheduleBlockRealize& sche_block_realize) { + const ir::ScheduleBlock* sche_block = sche_block_realize.schedule_block.As(); + if (sche_block->write_buffers.size() != 1 || sche_block->read_buffers.empty()) { + return false; + } + const ir::Expr& write_buffer = sche_block->write_buffers[0].As()->buffer; + + // Enumerate each read region, get the number of schedule block iter vars + // which are not used to index the read region + int total_unused_iter_vars = 0; + + for (const ir::Expr& read_buffer_expr : sche_block->read_buffers) { + const ir::_BufferRange_* read_buffer = read_buffer_expr.As(); + // Skip the reduction buffer + if (read_buffer->buffer == write_buffer) { + continue; + } + 
// Collect the vars in schedule block that are used to index the read region + std::unordered_set vars_index_read; + for (const Var& range : read_buffer->ranges) { + vars_index_read.insert(range->name); + } + // Check the block iter vars are not used to index the read region + int n_unused_block_vars = 0; + for (const ir::Var& block_iter_var : sche_block->iter_vars) { + if (!block_iter_var->is_reduce_axis) { + bool iter_var_in_read = false; + for (const std::string& var : vars_index_read) { + if (var == block_iter_var->name) { + iter_var_in_read = true; + break; + } + } + if (!iter_var_in_read) { + ++n_unused_block_vars; + } + } + } + total_unused_iter_vars += n_unused_block_vars; + } + + return total_unused_iter_vars >= 1; +} + +ir::LoweredFunc UpdateFuncWithNewBody(const common::Target& target, const ir::LoweredFunc& old_func, ir::Expr& body) { + ir::ModuleExpr mod_expr(std::vector({body})); + ir::IRSchedule ir_sch(mod_expr); + + // temp_bufs may be deleted during auto tuning (such as auto inline), + // we have to check from old temp bufs and set them as local buffer. + for (const ir::Buffer& buf : old_func->temp_bufs) { + const std::string& buf_name = buf->name; + std::vector all_block_realizes = ir_sch.GetAllBlocks(); + for (ir::Expr& e : all_block_realizes) { + const ir::ScheduleBlockRealize* sche_block_realize = e.As(); + const std::string& sche_name = sche_block_realize->schedule_block.As()->name; + if (buf_name == "_" + sche_name) { + VLOG(6) << "Set local buffer for temp buffer " << buf_name; + ir_sch.SetBuffer(e, "local", true); + break; + } + } + } + + ir::Expr updated_body = ir_sch.GetModule().GetExprs()[0]; +#ifdef CINN_WITH_CUDA + optim::OptimizeExprGPU(&updated_body); +#endif + + // Get new temp bufs by analyzing. + std::vector new_temp_bufs = lang::GetTempBuffers(old_func->args, updated_body); + ir::LoweredFunc new_func = ir::_LoweredFunc_::Make(old_func->name, old_func->args, updated_body, new_temp_bufs); +#ifdef CINN_WITH_CUDA + if (target == common::DefaultNVGPUTarget()) { + new_func->PrepareCudaAxisInfoFromBody(); + } +#endif + new_func = optim::Optimize(Expr(new_func), target, false).as_lowered_func_ref(); + new_func->PrepareBufferCastExprs(/*with_expr_gen_tensor = */ false); + + return new_func; +} + +} // namespace auto_schedule +} // namespace cinn diff --git a/paddle/cinn/auto_schedule/analysis/analyze_ir.h b/paddle/cinn/auto_schedule/analysis/analyze_ir.h new file mode 100644 index 0000000000000..f2d214db89e43 --- /dev/null +++ b/paddle/cinn/auto_schedule/analysis/analyze_ir.h @@ -0,0 +1,48 @@ +// Copyright (c) 2022 CINN Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
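The unused-iter-var rule implemented by NeedsMultiLevelTiling above can be reproduced in isolation. Below is a minimal standalone sketch with hypothetical toy types (not CINN's ir:: nodes), which also omits the write-buffer skip and the reduce-axis filtering of the real code: for C[i, j] = sum over k of A[i, k] * B[k, j], j is unused when reading A and i is unused when reading B, so the count is 2 and the block qualifies for multi-level tiling.

#include <algorithm>
#include <iostream>
#include <string>
#include <vector>

// Toy stand-in for a schedule block: its non-reduce iter vars plus, for each
// read region, the vars that index that region.
struct ToyBlock {
  std::vector<std::string> iter_vars;
  std::vector<std::vector<std::string>> read_vars;
};

// Mirrors the counting rule above: one or more iter vars unused by some read
// region marks the block for multi-level tiling.
bool ToyNeedsMultiLevelTiling(const ToyBlock& block) {
  int total_unused_iter_vars = 0;
  for (const auto& read : block.read_vars) {
    for (const auto& v : block.iter_vars) {
      if (std::find(read.begin(), read.end(), v) == read.end()) {
        ++total_unused_iter_vars;
      }
    }
  }
  return total_unused_iter_vars >= 1;
}

int main() {
  // Matmul-like block: iter vars {i, j}, reads A[i, k] and B[k, j].
  ToyBlock matmul{{"i", "j"}, {{"i", "k"}, {"k", "j"}}};
  std::cout << ToyNeedsMultiLevelTiling(matmul) << std::endl;  // prints 1
  return 0;
}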
+ +#pragma once + +#include +#include + +#include "cinn/ir/ir.h" +#include "cinn/ir/ir_base.h" +#include "cinn/ir/ir_schedule.h" +#include "cinn/ir/lowered_func.h" + +namespace cinn { +namespace auto_schedule { + +void AnalyzeScheduleBlockReadWriteBuffer(ir::ScheduleBlock* sche_block); + +bool ContainsNodeType(ir::Expr expr, const std::unordered_set& node_types); + +/** + * Collects all input lowered_funcs and return names of all output arguments + */ +std::unordered_set GetOutputNamesFromLoweredFunc(const std::vector& lowered_funcs); + +/** + * Determine whether a schedule block needs multileveltiling + */ +bool NeedsMultiLevelTiling(const ir::ScheduleBlockRealize& sche_block_realize); + +/** + * Update a LoweredFunc by regenerating related fields with a new function body + */ +ir::LoweredFunc UpdateFuncWithNewBody(const common::Target& target, const ir::LoweredFunc& old_func, ir::Expr& body); + +} // namespace auto_schedule +} // namespace cinn diff --git a/paddle/cinn/auto_schedule/analysis/analyze_ir_test.cc b/paddle/cinn/auto_schedule/analysis/analyze_ir_test.cc new file mode 100644 index 0000000000000..e51bd0e94cf26 --- /dev/null +++ b/paddle/cinn/auto_schedule/analysis/analyze_ir_test.cc @@ -0,0 +1,181 @@ +// Copyright (c) 2022 CINN Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "cinn/auto_schedule/analysis/analyze_ir.h" + +#include +#include + +#include +#include + +#include "cinn/common/context.h" +#include "cinn/ir/ir.h" +#include "cinn/ir/ir_base.h" +#include "cinn/ir/ir_schedule.h" +#include "cinn/ir/tensor.h" +#include "cinn/lang/compute.h" +#include "cinn/lang/lower.h" +#include "cinn/lang/placeholder.h" +#include "cinn/poly/stage.h" +#include "cinn/utils/string.h" + +namespace cinn { +namespace auto_schedule { + +TEST(AnalyzeIr, AnalyzeScheduleBlockReadWriteBuffer_SimpleAssign) { + Context::Global().ResetNameId(); +#ifdef CINN_WITH_CUDA + Target target = common::DefaultNVGPUTarget(); +#else + Target target = common::DefaultHostTarget(); +#endif + + ir::Expr M(32); + ir::Expr N(32); + + lang::Placeholder A("A", {M, N}); + ir::Tensor B = lang::Compute( + {M, N}, [&](Var i, Var j) { return A(i, j); }, "B"); + + poly::StageMap stages = poly::CreateStages({A, B}); + std::vector funcs = lang::LowerVec("SimpleAssign", stages, {A, B}, {}, {}, nullptr, target, true); + + ASSERT_FALSE(funcs.empty()); + ir::Expr ast_expr = funcs[0]->body; + + VLOG(6) << "Analyzing for Expr:"; + VLOG(6) << ast_expr; + + std::vector vec_ast{ast_expr}; + ir::ModuleExpr mod_expr(vec_ast); + ir::IRSchedule ir_sch(mod_expr); + + std::vector all_block_realizes = ir_sch.GetAllBlocks(); + ASSERT_EQ(all_block_realizes.size(), 1UL); + + ir::ScheduleBlockRealize* sche_block_realize = all_block_realizes[0].As(); + ir::ScheduleBlock* sche_block = sche_block_realize->schedule_block.As(); + AnalyzeScheduleBlockReadWriteBuffer(sche_block); + + /* + * the sche_block_realize will be: + * ScheduleBlock(B) + * { + * i0, i1 = axis.bind(i, j) + * read_buffers(_A[i0(undefined:undefined), i1(undefined:undefined)]) + * write_buffers(_B[i0(undefined:undefined), i1(undefined:undefined)]) + * B[i0, i1] = A[i0, i1] + * } + */ + + VLOG(6) << "ScheduleBlockRealize: "; + VLOG(6) << all_block_realizes[0]; + + ASSERT_EQ(sche_block->read_buffers.size(), 1UL); + + std::stringstream read_ss; + read_ss << sche_block->read_buffers[0]; + ASSERT_EQ(read_ss.str(), "_A[i0(0:32), i1(0:32)]"); + + ASSERT_EQ(sche_block->write_buffers.size(), 1UL); + std::stringstream write_ss; + write_ss << sche_block->write_buffers[0]; + ASSERT_EQ(write_ss.str(), "_B[i0(0:32), i1(0:32)]"); +} + +TEST(AnalyzeIr, AnalyzeScheduleBlockReadWriteBuffer_AddDiffShape) { + Context::Global().ResetNameId(); +#ifdef CINN_WITH_CUDA + Target target = common::DefaultNVGPUTarget(); +#else + Target target = common::DefaultHostTarget(); +#endif + + ir::Expr M(32); + ir::Expr N(128); + + lang::Placeholder A("A", {M}); + lang::Placeholder B("B", {N}); + + ir::Tensor C = lang::Compute( + {M, N}, [&](Var i, Var j) { return A(i) + B(j); }, "C"); + + poly::StageMap stages = poly::CreateStages({C}); + std::vector funcs = lang::LowerVec("AddDiffShape", stages, {C}, {}, {}, nullptr, target, true); + + ir::Expr ast_expr = funcs[0]->body; + VLOG(6) << "Expr before MultiLevelTiling: "; + VLOG(6) << ast_expr; + + std::vector vec_ast{ast_expr}; + ir::ModuleExpr mod_expr(vec_ast); + ir::IRSchedule ir_sch(mod_expr); + + std::vector all_block_realizes = ir_sch.GetAllBlocks(); + ASSERT_EQ(all_block_realizes.size(), 1UL); + + ir::ScheduleBlockRealize* sche_block_realize = all_block_realizes[0].As(); + ir::ScheduleBlock* sche_block = sche_block_realize->schedule_block.As(); + AnalyzeScheduleBlockReadWriteBuffer(sche_block); + + VLOG(6) << "ScheduleBlockRealize: "; + VLOG(6) << all_block_realizes[0]; + ASSERT_EQ(sche_block->read_buffers.size(), 2UL); + std::vector 
expect_read = {"_A[i0(0:32)]", "_B[i1(0:128)]"}; + + ASSERT_EQ(sche_block->read_buffers.size(), expect_read.size()); + for (size_t i = 0; i < expect_read.size(); ++i) { + std::stringstream read_ss; + read_ss << sche_block->read_buffers[i]; + ASSERT_EQ(read_ss.str(), expect_read[i]); + } + + ASSERT_EQ(sche_block->write_buffers.size(), 1UL); + std::stringstream write_ss; + write_ss << sche_block->write_buffers[0]; + ASSERT_EQ(write_ss.str(), "_C[i0(0:32), i1(0:128)]"); +} + +TEST(AnalyzeIr, ContainsNodeType) { + Context::Global().ResetNameId(); +#ifdef CINN_WITH_CUDA + Target target = common::DefaultNVGPUTarget(); +#else + Target target = common::DefaultHostTarget(); +#endif + + ir::Expr M(32); + ir::Expr N(32); + + lang::Placeholder A("A", {M, N}); + ir::Tensor B = lang::Compute( + {M, N}, [&](Var i, Var j) { return A(i, j); }, "B"); + + poly::StageMap stages = poly::CreateStages({A, B}); + std::vector funcs = lang::LowerVec("SimpleAssign", stages, {A, B}, {}, {}, nullptr, target, true); + + ASSERT_FALSE(funcs.empty()); + ir::Expr ast_expr = funcs[0]->body; + + VLOG(6) << "Analyzing for Expr:"; + VLOG(6) << ast_expr; + + ASSERT_TRUE(ContainsNodeType(ast_expr, {ir::IrNodeTy::Load, ir::IrNodeTy::Store})); + ASSERT_TRUE(ContainsNodeType(ast_expr, {ir::IrNodeTy::Load, ir::IrNodeTy::IfThenElse})); + ASSERT_FALSE(ContainsNodeType(ast_expr, {ir::IrNodeTy::IfThenElse, ir::IrNodeTy::Sum})); +} + +} // namespace auto_schedule +} // namespace cinn diff --git a/paddle/cinn/auto_schedule/auto_schedule.proto b/paddle/cinn/auto_schedule/auto_schedule.proto new file mode 100644 index 0000000000000..d5d8eff373fa3 --- /dev/null +++ b/paddle/cinn/auto_schedule/auto_schedule.proto @@ -0,0 +1,26 @@ +// Copyright (c) 2022 CINN Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +syntax ="proto3"; + +package cinn.auto_schedule.proto; + +import "cinn/ir/schedule_desc.proto"; + +message TuningRecord { + string task_key = 1; + double execution_cost = 2; + double predicted_cost = 3; + cinn.ir.proto.ScheduleDesc trace = 4; +} diff --git a/paddle/cinn/auto_schedule/auto_tuner.cc b/paddle/cinn/auto_schedule/auto_tuner.cc new file mode 100644 index 0000000000000..86baae7007a56 --- /dev/null +++ b/paddle/cinn/auto_schedule/auto_tuner.cc @@ -0,0 +1,163 @@ +// Copyright (c) 2022 CINN Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
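The TuningRecord message defined above is what the database persists for each measured schedule. A short sketch of filling one through the standard protobuf-generated C++ API; the generated header path is an assumption (it is produced by the proto_library rule from auto_schedule.proto):

#include <string>

#include "cinn/auto_schedule/auto_schedule.pb.h"  // assumed generated header path

int main() {
  cinn::auto_schedule::proto::TuningRecord record;
  record.set_task_key("fn_elementwise_add_0");  // hypothetical task key
  record.set_execution_cost(0.42);              // measured cost
  record.set_predicted_cost(0.40);              // cost-model estimate
  // record.mutable_trace() would carry the ScheduleDesc replay trace.
  std::string bytes;
  record.SerializeToString(&bytes);
  return 0;
}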
+ +#include "cinn/auto_schedule/auto_tuner.h" + +#include +#include + +#include +#include +#include + +#include "cinn/auto_schedule/database/jsonfile_database.h" +#include "cinn/auto_schedule/measure/schedule_measurer.h" +#include "cinn/auto_schedule/measure/simple_builder.h" +#include "cinn/auto_schedule/measure/simple_runner.h" +#include "cinn/auto_schedule/task/task_creator.h" +#include "cinn/auto_schedule/task/task_registry.h" +#include "cinn/auto_schedule/task/tune_task.h" +#include "cinn/auto_schedule/task_scheduler/task_scheduler.h" +#include "cinn/common/context.h" +#include "cinn/common/type.h" +#include "cinn/hlir/framework/op.h" +#include "cinn/hlir/framework/visualize_helper.h" +#include "cinn/utils/string.h" + +namespace cinn { +namespace auto_schedule { + +AutoTuner::AutoTuner(const common::Target& target, hlir::framework::Graph* graph) : target_(target), graph_(graph) {} + +void AutoTuner::Initialize(const Config& config, hlir::framework::GraphCompiler* graph_compiler) { + // create builder, runner, and schedule measurer + builder_ = std::make_unique(graph_compiler); + runner_ = std::make_unique(config.runner_repeat_times); + schedule_measurer_ = std::make_unique(builder_.get(), runner_.get()); + + // initialize database + database_ = std::move(Database::Make(config.database_config)); + + // create tasks + TaskCreator task_creator; + tasks_ = task_creator.CreateTuneTaskOpLevel(graph_); + + const auto& dtype_dict = graph_->GetAttrs>("inferdtype"); + const auto& shape_dict = graph_->GetAttrs>("infershape"); + + op_lowerer_ = std::make_unique(dtype_dict, shape_dict, target_); + InitialTaskRegistry* task_registry = InitialTaskRegistry::Global(); + for (auto i = 0; i < tasks_.size(); ++i) { + auto&& task = tasks_[i]; + task.Initialize(shape_dict, dtype_dict, op_lowerer_.get()); + // Register the initial ModuleExpr corresponding to the task + task_registry->Regist(task.serialized_key, ir::ModuleExpr(task.GetLoweredFuncBodyExprs())); + VLOG(3) << "Add a task, id:" << i << ", serialized_key:\n" << task.serialized_key; + } + + // create task optimizers + utils::LinearRandomEngine::StateType initial_seed = utils::LinearRandomEngine::GetDeviceRandomValue(); + task_optimizers_.resize(tasks_.size()); + std::transform(tasks_.begin(), tasks_.end(), task_optimizers_.begin(), [&](TuneTask& task) { + return std::make_unique( + &task, schedule_measurer_.get(), database_.get(), utils::ForkRandomState(&initial_seed)); + }); + + // create task scheduler + task_scheduler_ = TaskScheduler::Make(tasks_, config.task_schedule_config, config.task_schedule_strategy); +} + +void PrintResult(std::shared_ptr group) { + if (!VLOG_IS_ON(3)) { + return; + } + + auto nodes = group->CollectNodes(); + VLOG(3) << "Node size:" << nodes.size(); + VLOG(3) << "Group {"; + for (auto* node : nodes) { + VLOG(3) << " " << hlir::framework::DebugString(node); + } + VLOG(3) << "}"; +} + +void PrintResult(const FunctionGroup& functions) { + if (!VLOG_IS_ON(3)) { + return; + } + + VLOG(3) << "Function size:" << functions.size(); + for (auto i = 0; i < functions.size(); ++i) { + const ir::LoweredFunc& func = functions.at(i); + VLOG(3) << "LoweredFunc-" << i << " detail:\n" << func; + } +} + +void PrintResult(const TuningResult& result) { + if (!VLOG_IS_ON(3)) { + return; + } + VLOG(3) << "###### Debug TuningResult ######\n"; + VLOG(3) << "Tuned SubGraph num:" << result.subgraphs.size(); + for (auto i = 0; i < result.subgraphs.size(); ++i) { + VLOG(3) << "****** SubGraph-" << i << " Detail ******\n"; + 
+    PrintResult(result.subgraphs.at(i));
+    VLOG(3) << "****** SubGraph End ******";
+  }
+
+  VLOG(3) << "Tuned FunctionGroup num:" << result.function_groups.size();
+  for (auto i = 0; i < result.function_groups.size(); ++i) {
+    VLOG(3) << "****** FunctionGroup-" << i << " Detail ******\n";
+    PrintResult(result.function_groups.at(i));
+    VLOG(3) << "****** FunctionGroup End ******";
+  }
+  VLOG(3) << "###### TuningResult End ######";
+}
+
+TuningResult AutoTuner::Tune(const TuningOptions& options) {
+  CHECK_GT(options.num_tuning_rounds, 0) << "Invalid config";
+  VLOG(3) << "Begin tuning with round num=" << options.num_tuning_rounds << ", tasks size=" << tasks_.size();
+
+  TuningResult result;
+  result.subgraphs.resize(tasks_.size());
+  result.function_groups.resize(tasks_.size());
+  // A task only tunes the schedule for now, so we populate its sub_graph
+  // as the default result of graph tuning; this should be updated
+  // once graph tuning is supported.
+  for (auto i = 0; i < tasks_.size(); ++i) {
+    auto&& task = tasks_.at(i);
+    result.subgraphs[i] = task.subgraph;
+  }
+
+  for (int r = 0; r < options.num_tuning_rounds; ++r) {
+    VLOG(3) << "<<<<<< Round " << r << " >>>>>>";
+    int run_id = -1;
+    task_scheduler_->Reset();
+    while ((run_id = task_scheduler_->NextTaskId()) != -1) {
+      VLOG(3) << "Start tuning Task-" << run_id;
+      auto* opt = task_optimizers_.at(run_id).get();
+      auto function_group = opt->Optimize(options);
+      VLOG(3) << "Task-" << run_id << " finished, print optimized functions:\n";
+      PrintResult(function_group);
+      // Update the best schedules searched so far.
+      result.function_groups.at(run_id) = std::move(function_group);
+    }
+  }
+
+  PrintResult(result);
+  return result;
+}
+
+}  // namespace auto_schedule
+}  // namespace cinn
diff --git a/paddle/cinn/auto_schedule/auto_tuner.h b/paddle/cinn/auto_schedule/auto_tuner.h
new file mode 100644
index 0000000000000..6a356bd3dd7b1
--- /dev/null
+++ b/paddle/cinn/auto_schedule/auto_tuner.h
@@ -0,0 +1,79 @@
+// Copyright (c) 2022 CINN Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "cinn/auto_schedule/measure/schedule_measurer.h"
+#include "cinn/auto_schedule/task/task_optimizer.h"
+#include "cinn/auto_schedule/task/tune_task.h"
+#include "cinn/auto_schedule/task_scheduler/task_scheduler.h"
+#include "cinn/auto_schedule/tuning.h"
+#include "cinn/common/target.h"
+#include "cinn/hlir/framework/graph.h"
+#include "cinn/hlir/framework/graph_compiler.h"
+#include "cinn/hlir/framework/op_lowering.h"
+
+namespace cinn {
+namespace auto_schedule {
+
+// This class is the entrance of auto-tuning. Users can use it to tune a graph
+// (not supported yet) and to search for a series of schedules that are likely
+// to deliver better performance.
+// Internally, it creates the necessary components and uses them to perform tuning.
+class AutoTuner {
+ public:
+  // Configures how auto-tuning is performed, e.g. the way tasks are created
+  // and the strategy used to schedule them.
+  struct Config {
+    std::string task_schedule_strategy = "round_robin";
+    TaskScheduler::Config task_schedule_config;
+    int runner_repeat_times = 1;
+    DatabaseConfig database_config;
+  };
+
+  AutoTuner(const common::Target& target, hlir::framework::Graph* graph);
+
+  // Initialize the tuner with a specific config and auxiliary objects.
+  void Initialize(const Config& config, hlir::framework::GraphCompiler* graph_compiler);
+
+  // Perform the tuning process and return the final result.
+  TuningResult Tune(const TuningOptions& options);
+
+ private:
+  const common::Target& target_;
+  hlir::framework::Graph* graph_;
+  std::unique_ptr<hlir::framework::OpLowerer> op_lowerer_;
+
+  // Tasks to tune.
+  std::vector<TuneTask> tasks_;
+  // Scheduler that selects a task to tune at every turn.
+  std::unique_ptr<TaskScheduler> task_scheduler_;
+  // The actors that perform auto-tuning; each optimizer takes one task.
+  std::vector<std::unique_ptr<TaskOptimizer>> task_optimizers_;
+
+  // Components used to measure auto-tune samples.
+  std::unique_ptr<ScheduleBuilder> builder_;
+  std::unique_ptr<ScheduleRunner> runner_;
+  std::unique_ptr<ScheduleMeasurer> schedule_measurer_;
+
+  // The database that stores tuning records.
+  std::unique_ptr<Database> database_;
+};
+
+}  // namespace auto_schedule
+}  // namespace cinn
diff --git a/paddle/cinn/auto_schedule/auto_tuner_test.cc b/paddle/cinn/auto_schedule/auto_tuner_test.cc
new file mode 100644
index 0000000000000..362a279e852d1
--- /dev/null
+++ b/paddle/cinn/auto_schedule/auto_tuner_test.cc
@@ -0,0 +1,164 @@
+// Copyright (c) 2022 CINN Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
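With the declaration complete, the intended call pattern is construct, Initialize, then Tune. A minimal caller sketch, assuming graph and graph_compiler have already been built (the test below shows the full construction):

#include "cinn/auto_schedule/auto_tuner.h"

// Hypothetical helper: drives one tuning session with default settings.
cinn::auto_schedule::TuningResult TuneOnce(const cinn::common::Target& target,
                                           cinn::hlir::framework::Graph* graph,
                                           cinn::hlir::framework::GraphCompiler* graph_compiler) {
  cinn::auto_schedule::AutoTuner::Config config;
  config.task_schedule_strategy = "round_robin";  // the default strategy above

  cinn::auto_schedule::TuningOptions options;
  options.num_tuning_rounds = 2;  // hypothetical tuning budget

  cinn::auto_schedule::AutoTuner tuner(target, graph);
  tuner.Initialize(config, graph_compiler);
  return tuner.Tune(options);
}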
+ +#include "cinn/auto_schedule/auto_tuner.h" + +#include +#include + +#include +#include + +#include "cinn/common/target.h" +#include "cinn/frontend/net_builder.h" +#include "cinn/frontend/optimize.h" +#include "cinn/frontend/syntax.h" +#include "cinn/hlir/framework/graph.h" +#include "cinn/hlir/framework/graph_compiler.h" +#include "cinn/hlir/framework/node.h" +#include "cinn/hlir/framework/pass.h" +#include "cinn/ir/ir_base.h" +#include "cinn/runtime/flags.h" + +DECLARE_bool(auto_schedule_use_cost_model); +DECLARE_bool(cinn_ir_schedule); + +namespace cinn { +namespace auto_schedule { + +using ::cinn::hlir::framework::BuildScope; +using ::cinn::hlir::framework::Graph; +using ::cinn::hlir::framework::GraphCompiler; +using ::cinn::hlir::framework::Instruction; +using ::cinn::hlir::framework::Node; +using ::cinn::hlir::framework::Scope; + +class TestAutoTuner : public ::testing::Test { + public: +#ifdef CINN_WITH_CUDA + Target target = common::DefaultNVGPUTarget(); +#else + Target target = common::DefaultHostTarget(); +#endif + + std::shared_ptr graph; + std::shared_ptr compiled_scope; + std::unique_ptr graph_compiler; + std::unique_ptr tuner; + + frontend::Program CreateAddReluProgram() { + frontend::NetBuilder builder("test"); + + auto a = builder.CreateInput(Float(32), {1, 64, 112, 112}, "A"); + auto b = builder.CreateInput(Float(32), {64}, "B"); + auto c = builder.Add(a, b, 1); + auto d = builder.Relu(c); + + return builder.Build(); + } + + void SetUp() override { + srand(0); + // AutoTuner is combined with new IR Schedule + FLAGS_cinn_ir_schedule = true; + std::unordered_set fetch_ids; + auto program = CreateAddReluProgram(); + auto graph = cinn::frontend::Optimize(&program, fetch_ids, target); + compiled_scope = BuildScope(target, graph); + graph_compiler = std::make_unique(target, compiled_scope, graph); + tuner = std::make_unique(target, graph.get()); + } + + TuningResult InitializeAndTune(const AutoTuner::Config& config, const TuningOptions& options) { + tuner->Initialize(config, graph_compiler.get()); + return tuner->Tune(options); + } + + virtual void BasicCheckResult(const TuningResult& result) { + ASSERT_EQ(1, result.subgraphs.size()); + auto nodes = result.subgraphs.front()->CollectNodes(); + ASSERT_EQ(nodes.size(), 4UL); + ASSERT_EQ(nodes[0]->op()->name, "broadcast_to"); + ASSERT_EQ(nodes[1]->op()->name, "fill_constant"); + ASSERT_EQ(nodes[2]->op()->name, "elementwise_add"); + ASSERT_EQ(nodes[3]->op()->name, "max"); + + ASSERT_EQ(result.function_groups.size(), 1UL); + ASSERT_EQ(result.function_groups[0].size(), 1UL); + } + + virtual void ApplyTunedAndRun(const TuningResult& result) { + // build runtime program with tuning result + GraphCompiler::CompileOptions compile_options; + compile_options.with_instantiate_variables = true; + compile_options.Apply(result); + ASSERT_EQ(1, compile_options.groups.size()); + ASSERT_EQ(1, compile_options.lowered_funcs.size()); + VLOG(6) << "Print lowered_funcs before building"; + VLOG(6) << compile_options.lowered_funcs[0][0]; + VLOG(6) << compile_options.lowered_funcs[1][0]; + auto runtime_program = graph_compiler->Build(compile_options).runtime_program; + ASSERT_EQ(1, runtime_program->size()); + runtime_program->Execute(); + } + + void ZeroMeasure() { + // set config and options + AutoTuner::Config tuning_config; + tuning_config.task_schedule_strategy = "round_robin"; + + TuningOptions tuning_options; + tuning_options.num_measure_trials = 0; + auto result = InitializeAndTune(tuning_config, tuning_options); + BasicCheckResult(result); + 
ApplyTunedAndRun(result); + } + + void NonZeroMeasure() { + // set config and options + AutoTuner::Config tuning_config; + tuning_config.task_schedule_strategy = "round_robin"; + + TuningOptions tuning_options; + tuning_options.num_measure_trials = 4; + tuning_options.num_samples_per_iteration = 2; + + auto result = InitializeAndTune(tuning_config, tuning_options); + BasicCheckResult(result); + ApplyTunedAndRun(result); + } +}; + +TEST_F(TestAutoTuner, ZeroMeasure_DisableCostModel) { + FLAGS_auto_schedule_use_cost_model = false; + ZeroMeasure(); +} + +TEST_F(TestAutoTuner, ZeroMeasure_EnableCostModel) { + FLAGS_auto_schedule_use_cost_model = true; + ZeroMeasure(); +} + +TEST_F(TestAutoTuner, NonZeroMeasure_DisableCostModel) { + FLAGS_auto_schedule_use_cost_model = false; + NonZeroMeasure(); +} + +TEST_F(TestAutoTuner, NonZeroMeasure_EnableCostModel) { + FLAGS_auto_schedule_use_cost_model = true; + NonZeroMeasure(); +} + +} // namespace auto_schedule +} // namespace cinn diff --git a/paddle/cinn/auto_schedule/cost_model/CMakeLists.txt b/paddle/cinn/auto_schedule/cost_model/CMakeLists.txt new file mode 100644 index 0000000000000..6e52f7a3dad14 --- /dev/null +++ b/paddle/cinn/auto_schedule/cost_model/CMakeLists.txt @@ -0,0 +1,7 @@ +core_gather_headers() + +gather_srcs(cinnapi_src SRCS xgb_cost_model.cc expr_cost_model.cc feature.cc feature_extractor.cc) + +cc_test(test_xgb_cost_model SRCS xgb_cost_model_test.cc DEPS cinncore) +cc_test(test_feature_extractor SRCS feature_extractor_test.cc DEPS cinncore) +cc_test(test_feature SRCS feature_test.cc DEPS cinncore) diff --git a/paddle/cinn/auto_schedule/cost_model/expr_cost_model.cc b/paddle/cinn/auto_schedule/cost_model/expr_cost_model.cc new file mode 100644 index 0000000000000..e41a71a409109 --- /dev/null +++ b/paddle/cinn/auto_schedule/cost_model/expr_cost_model.cc @@ -0,0 +1,77 @@ +// Copyright (c) 2022 CINN Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
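The two fixtures above differ only in their measurement budget. Restated as standalone option builders (values copied from the tests; the zero-trial interpretation is an assumption based on the test name, and the header is assumed to be where TuningOptions lives):

#include "cinn/auto_schedule/tuning.h"  // assumed home of TuningOptions

// ZeroMeasure: no candidate is built and run, so the search presumably
// relies on the cost model alone (assumption from the test name).
cinn::auto_schedule::TuningOptions ZeroMeasureOptions() {
  cinn::auto_schedule::TuningOptions options;
  options.num_measure_trials = 0;
  return options;
}

// NonZeroMeasure: four measured candidates in total, two per iteration.
cinn::auto_schedule::TuningOptions NonZeroMeasureOptions() {
  cinn::auto_schedule::TuningOptions options;
  options.num_measure_trials = 4;
  options.num_samples_per_iteration = 2;
  return options;
}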
+ +#include "cinn/auto_schedule/cost_model/expr_cost_model.h" + +#include + +#include +#include + +#include "cinn/auto_schedule/cost_model/feature.h" +#include "cinn/auto_schedule/cost_model/feature_extractor.h" +#include "cinn/auto_schedule/search_space/search_state.h" +#include "cinn/common/target.h" +#include "cinn/ir/ir_schedule.h" + +namespace cinn { +namespace auto_schedule { + +float ExprCostModel::Predict(const ir::ModuleExpr& sample, const common::Target& target) const { + if (trained_times_.load() == 0) { + return SearchState::NOT_INIT_COST; + } + FeatureExtractor extractor; + Feature feature = extractor.Extract(sample, target); + std::vector feature_numbers = feature.ToFixedSizeVector(); + std::vector pred = XgbCostModel::Predict({feature_numbers}); + return pred[0]; +} + +void ExprCostModel::Train(const std::vector& samples, + const std::vector& labels, + const common::Target& target) { + trained_times_.store(1); + size_t total_size = samples.size(); + CHECK_EQ(total_size, labels.size()) << "Samples must have same size as labels"; + std::vector> train_feature_numbers(total_size); + FeatureExtractor extractor; + for (size_t i = 0; i < total_size; ++i) { + CHECK(samples[i] != nullptr) << "Train samples cannot be nullptr"; + Feature feature = extractor.Extract(*samples[i], target); + train_feature_numbers[i] = feature.ToFixedSizeVector(); + } + + XgbCostModel::Train(train_feature_numbers, labels); +} + +void ExprCostModel::Update(const std::vector& samples, + const std::vector& labels, + const common::Target& target) { + ++trained_times_; + size_t total_size = samples.size(); + CHECK_EQ(total_size, labels.size()) << "Samples must have same size as labels"; + std::vector> train_feature_numbers(total_size); + FeatureExtractor extractor; + for (size_t i = 0; i < total_size; ++i) { + CHECK(samples[i] != nullptr) << "Train samples cannot be nullptr"; + Feature feature = extractor.Extract(*samples[i], target); + train_feature_numbers[i] = feature.ToFixedSizeVector(); + } + + XgbCostModel::Update(train_feature_numbers, labels); +} + +} // namespace auto_schedule +} // namespace cinn diff --git a/paddle/cinn/auto_schedule/cost_model/expr_cost_model.h b/paddle/cinn/auto_schedule/cost_model/expr_cost_model.h new file mode 100644 index 0000000000000..176424c785cb0 --- /dev/null +++ b/paddle/cinn/auto_schedule/cost_model/expr_cost_model.h @@ -0,0 +1,45 @@ +// Copyright (c) 2022 CINN Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+
+#pragma once
+
+#include <atomic>
+#include <vector>
+
+#include "cinn/auto_schedule/cost_model/xgb_cost_model.h"
+#include "cinn/ir/ir_schedule.h"
+
+namespace cinn {
+namespace auto_schedule {
+
+/**
+ * A C++ cost model which trains and predicts on ir::Expr
+ */
+class ExprCostModel : public XgbCostModel {
+ public:
+  virtual float Predict(const ir::ModuleExpr& sample, const common::Target& target) const;
+  void Train(const std::vector<const ir::ModuleExpr*>& samples,
+             const std::vector<float>& labels,
+             const common::Target& target);
+  void Update(const std::vector<const ir::ModuleExpr*>& samples,
+              const std::vector<float>& labels,
+              const common::Target& target);
+
+ private:
+  std::atomic<int> trained_times_{0};
+};
+
+}  // namespace auto_schedule
+}  // namespace cinn
diff --git a/paddle/cinn/auto_schedule/cost_model/feature.cc b/paddle/cinn/auto_schedule/cost_model/feature.cc
new file mode 100644
index 0000000000000..1c7f8158eb409
--- /dev/null
+++ b/paddle/cinn/auto_schedule/cost_model/feature.cc
@@ -0,0 +1,175 @@
+// Copyright (c) 2022 CINN Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
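A brief usage sketch of the ExprCostModel interface above; the samples and labels are hypothetical stand-ins for what the measurer produces:

#include <vector>

#include "cinn/auto_schedule/cost_model/expr_cost_model.h"

// Train on measured (ModuleExpr, cost) pairs, then score a new candidate.
// Until Train or Update has run at least once, Predict returns
// SearchState::NOT_INIT_COST (see the implementation above).
float SketchTrainThenPredict(const std::vector<const cinn::ir::ModuleExpr*>& samples,
                             const std::vector<float>& labels,
                             const cinn::ir::ModuleExpr& candidate,
                             const cinn::common::Target& target) {
  cinn::auto_schedule::ExprCostModel model;
  model.Train(samples, labels, target);
  return model.Predict(candidate, target);
}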
+ +#include "cinn/auto_schedule/cost_model/feature.h" + +#include + +#include + +#include "cinn/common/target.h" + +namespace cinn { +namespace auto_schedule { + +Feature::Feature() + : target_(common::UnkTarget()), + stack_encoded_feature_(1), // initialize a LoopBlockFeature as root block + current_loop_block_index_(0), + parent_indices_(1, -1) {} + +Feature::Feature(const common::Target& target) + : target_(target), + stack_encoded_feature_(1), // initialize a LoopBlockFeature as root block + current_loop_block_index_(0), + parent_indices_(1, -1) {} + +std::vector Feature::ToFixedSizeVector() { + std::vector ret(LoopBlockFeature::kTotalSize + 1, 0); // LoopBlockFeature::kTotalSize plus 1 for target + + if (target_ == common::DefaultNVGPUTarget()) { + ret[0] = 1; + } // else 0 for other cases + + // loop[i] feature count should multiply iter_multi_num[i] + std::vector iter_multi_num; + for (size_t i = 0; i < stack_encoded_feature_.size(); ++i) { + int j = 1; + const LoopBlockFeature& loop_feature = stack_encoded_feature_[i]; + int loop_prod = 1; + int parent_prod = 1; + if (i != 0) { + parent_prod = iter_multi_num[parent_indices_[i]]; + loop_prod = parent_prod * loop_feature.loop_length; + } + iter_multi_num.push_back(loop_prod); + + ret[j] += (loop_feature.float_add_or_sub * loop_prod); + ++j; + ret[j] += (loop_feature.float_mul * loop_prod); + ++j; + ret[j] += (loop_feature.float_div_or_mod * loop_prod); + ++j; + ret[j] += (loop_feature.float_cmp * loop_prod); + ++j; + ret[j] += (loop_feature.float_math_func * loop_prod); + ++j; + ret[j] += (loop_feature.float_other_call * loop_prod); + ++j; + + ret[j] += (loop_feature.int_add_or_sub * loop_prod); + ++j; + ret[j] += (loop_feature.int_mul * loop_prod); + ++j; + ret[j] += (loop_feature.int_div_or_mod * loop_prod); + ++j; + ret[j] += (loop_feature.int_cmp * loop_prod); + ++j; + ret[j] += (loop_feature.int_math_func * loop_prod); + ++j; + ret[j] += (loop_feature.int_other_call * loop_prod); + ++j; + + ret[j] += (loop_feature.bool_op * loop_prod); + ++j; + ret[j] += (loop_feature.select_op * loop_prod); + ++j; + + ret[j] += (loop_feature.mem_alloc * loop_prod); + ++j; + ret[j] += (loop_feature.mem_free * loop_prod); + ++j; + ret[j] += (loop_feature.mem_read * loop_prod); + ++j; + ret[j] += (loop_feature.mem_write * loop_prod); + ++j; + + ret[j] += (loop_feature.float_reduce_sum_or_sub * loop_prod); + ++j; + ret[j] += (loop_feature.float_reduce_mul * loop_prod); + ++j; + ret[j] += (loop_feature.float_reduce_div * loop_prod); + ++j; + ret[j] += (loop_feature.float_reduce_max_or_min * loop_prod); + ++j; + ret[j] += (loop_feature.float_broadcast * loop_prod); + ++j; + + ret[j] += (loop_feature.int_reduce_sum_or_sub * loop_prod); + ++j; + ret[j] += (loop_feature.int_reduce_mul * loop_prod); + ++j; + ret[j] += (loop_feature.int_reduce_div * loop_prod); + ++j; + ret[j] += (loop_feature.int_reduce_max_or_min * loop_prod); + ++j; + ret[j] += (loop_feature.int_broadcast * loop_prod); + ++j; + + ret[j + static_cast(loop_feature.loop_opt_type)] += 1; + j += LoopBlockFeature::kOptApplySize; + + ret[j] += (loop_feature.len_blockIdx_x * parent_prod); + ++j; + ret[j] += (loop_feature.len_blockIdx_y * parent_prod); + ++j; + ret[j] += (loop_feature.len_blockIdx_z * parent_prod); + ++j; + ret[j] += (loop_feature.len_threadIdx_x * parent_prod); + ++j; + ret[j] += (loop_feature.len_threadIdx_y * parent_prod); + ++j; + ret[j] += (loop_feature.len_threadIdx_z * parent_prod); + ++j; + ret[j] += (loop_feature.len_vthread * parent_prod); + ++j; + ret[j] += 
(loop_feature.vectorize_factor * parent_prod); + ++j; + } + + for (size_t i = 0; i < ret.size(); ++i) { + ret[i] = slog(ret[i]); + } + + return ret; +} + +void Feature::IntoLoopBlock() { + stack_encoded_feature_.emplace_back(LoopBlockFeature()); + stack_encoded_feature_[current_loop_block_index_].num_sub_loops += 1; + parent_indices_.push_back(current_loop_block_index_); + current_loop_block_index_ = stack_encoded_feature_.size() - 1; +} + +void Feature::ExitLoopBlock() { current_loop_block_index_ = parent_indices_[current_loop_block_index_]; } + +LoopBlockFeature& Feature::CurrentLoopBlock() { return stack_encoded_feature_[current_loop_block_index_]; } + +const LoopBlockFeature& Feature::CurrentLoopBlock() const { return stack_encoded_feature_[current_loop_block_index_]; } + +} // namespace auto_schedule +} // namespace cinn diff --git a/paddle/cinn/auto_schedule/cost_model/feature.h b/paddle/cinn/auto_schedule/cost_model/feature.h new file mode 100644 index 0000000000000..019bd25382432 --- /dev/null +++ b/paddle/cinn/auto_schedule/cost_model/feature.h @@ -0,0 +1,178 @@ +// Copyright (c) 2022 CINN Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include + +#include "cinn/common/target.h" +#include "cinn/ir/ir_schedule.h" + +namespace cinn { +namespace auto_schedule { + +/* Loop feature enums */ +enum class ForOptimizeFeatureEnum : int { kNone, kGpuBind, kParallel, kUnroll, kVectorize }; + +/* function to scale feature numbers */ +inline float slog(float x) { return x < 0 ? std::log2(-x + 1) : std::log2(x + 1); } + +class LoopBlockFeature { + public: + // TODO(zhhsplendid): distinguish more types such as float16, float32, + // float64, etc. However speed the gap between float and int are larger than + // different bits, so we just distinguished int and float here + /* Arithmetic features */ + int float_add_or_sub = 0; + int float_mul = 0; + int float_div_or_mod = 0; + int float_cmp = 0; + int float_math_func = 0; + int float_other_call = 0; // like simple assign, cast, etc. + + int int_add_or_sub = 0; + int int_mul = 0; + int int_div_or_mod = 0; + int int_cmp = 0; + int int_math_func = 0; + int int_other_call = 0; // like simple assign, cast, etc. + + int bool_op = 0; + int select_op = 0; + + static constexpr int kArithSize = 6 * 2 + 2; + + /** + * Buffer memory features, which is the number of memory operations. + * Note that different size of memory operation can have various speed, + * however the speed difference would be small in OS. 
A meticulous TODO + * may be collect operand sizes (like alloc size, write size, or so) + */ + int mem_alloc = 0; + int mem_free = 0; + int mem_read = 0; + int mem_write = 0; + + static constexpr int kMemSize = 4; + + /** + * Reduce and Broadcast features + */ + int float_reduce_sum_or_sub = 0; + int float_reduce_mul = 0; + int float_reduce_div = 0; + int float_reduce_max_or_min = 0; + int float_broadcast = 0; + + int int_reduce_sum_or_sub = 0; + int int_reduce_mul = 0; + int int_reduce_div = 0; + int int_reduce_max_or_min = 0; + int int_broadcast = 0; + + static constexpr int kReduceBroadcastSize = 10; + + /* Loop type features */ + + // A TODO maybe add loop position (Inner, Outer, Middle) feature + + ForOptimizeFeatureEnum loop_opt_type = ForOptimizeFeatureEnum::kNone; + + static constexpr int kOptApplySize = 5; + + /* Thread features if loop is optimized by GPU or CPU parallelism. + * Useless in other cases. + */ + int len_blockIdx_x = 0; + int len_blockIdx_y = 0; + int len_blockIdx_z = 0; + int len_threadIdx_x = 0; + int len_threadIdx_y = 0; + int len_threadIdx_z = 0; + int len_vthread = 0; // length of virtual thread + int vectorize_factor = 0; + + static constexpr int kThreadFeatureSize = 8; + + static constexpr int kTotalSize = kArithSize + kMemSize + kReduceBroadcastSize + kOptApplySize + kThreadFeatureSize; + + /* Non-feature attributes, used to maintain during feature_extractor */ + + // Number to indicate the loop block inside current one + int num_sub_loops = 0; + + // Number of repeats of this loop, -1 represents unknown + int loop_length = 1; +}; + +/** + * Feature of Expr. It is used in CostModel + */ +class Feature { + public: + Feature(); + + Feature(const common::Target& target); + + // Convert the various-length loop block features to fixed-size vector + std::vector ToFixedSizeVector(); + + // Call when visit into a loop block to collect LoopBlockFeature + void IntoLoopBlock(); + // Call when exit a loop block to collect LoopBlockFeature + void ExitLoopBlock(); + // The current loop block which we should collect feature on + LoopBlockFeature& CurrentLoopBlock(); + // The current loop block which we should collect feature on + const LoopBlockFeature& CurrentLoopBlock() const; + + private: + // We treat a computation feature to be encoded as variable-length vector. + // The root compute block is not a loop, but we treat it as a size-1 loop. + // Blocks are encoded like a stack. Each LoopBlockFeature contains a + // num_sub_loops to indicate the next level sub-loop-block it contains. 
+ // + // For example, code like: + // + // some_compute_0 + // loop1 { + // some_compute_1 + // loop2 { + // some_compute_2 + // } + // } + // + // loop3 { + // some_compute_3 + // } + // + // We go through the code and push loops into stack, then the features are encoded as + // [loop_block_feature_0, loop_block_feature_1, loop_block_feature_2, loop_block_feature_3] + // where loop_block_feature_i stores the features of some_compute_i (such + // as number of arithmetic operations) + // + // loop_block_feature_0.num_sub_loops = 2 + // loop_block_feature_1.num_sub_loops = 1 + // loop_block_feature_2.num_sub_loops = 0 + // loop_block_feature_3.num_sub_loops = 0 + std::vector stack_encoded_feature_; + int current_loop_block_index_; + std::vector parent_indices_; + + common::Target target_; +}; + +} // namespace auto_schedule +} // namespace cinn diff --git a/paddle/cinn/auto_schedule/cost_model/feature_extractor.cc b/paddle/cinn/auto_schedule/cost_model/feature_extractor.cc new file mode 100644 index 0000000000000..5f44b2e3f0a8d --- /dev/null +++ b/paddle/cinn/auto_schedule/cost_model/feature_extractor.cc @@ -0,0 +1,299 @@ +// Copyright (c) 2022 CINN Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
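To make the scaling described above concrete: a feature counted once inside nested loops is multiplied by the product of the enclosing loop extents, and the resulting vector is passed through slog. A toy calculation (numbers are illustrative only):

#include <cmath>
#include <iostream>

// Same definition as slog above: a sign-preserving log2 that maps 0 to 0
// and compresses large op counts.
inline float slog(float x) { return x < 0 ? std::log2(-x + 1) : std::log2(x + 1); }

int main() {
  // One float add nested in loops of extent 32 and 4 is counted
  // 32 * 4 = 128 times before scaling.
  float raw_count = 1.0f * 32 * 4;
  std::cout << slog(raw_count) << std::endl;  // log2(129) ~= 7.01
  return 0;
}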
+ +#include "cinn/auto_schedule/cost_model/feature_extractor.h" + +#include + +#include "cinn/common/target.h" +#include "cinn/common/type.h" +#include "cinn/ir/ir.h" +#include "cinn/ir/ir_base.h" +#include "cinn/ir/ir_printer.h" +#include "cinn/ir/ir_schedule.h" +#include "cinn/optim/ir_copy.h" +#include "cinn/optim/transform_polyfor_to_for.h" + +namespace cinn { +namespace auto_schedule { + +using namespace ::cinn::ir; + +FeatureExtractor::FeatureExtractor() {} + +void FeatureExtractor::Visit(const Expr *x) { IRVisitor::Visit(x); } + +Feature FeatureExtractor::Extract(const ir::ModuleExpr &mod_expr, const common::Target &target) { + feature_ = Feature(target); + for (const ir::Expr &e : mod_expr.GetExprs()) { + Visit(&e); + } + return feature_; +} + +#define VisitDoNothing(NodeType) \ + void FeatureExtractor::Visit(const NodeType *x) { \ + std::vector sub_exprs = x->expr_fields(); \ + for (const Expr *e : sub_exprs) { \ + if (e->defined()) { \ + Visit(e); \ + } \ + } \ + } + +VisitDoNothing(IntImm); +VisitDoNothing(UIntImm); +VisitDoNothing(FloatImm); +VisitDoNothing(StringImm); + +VisitDoNothing(Block); +VisitDoNothing(_Module_); +VisitDoNothing(_Var_); +VisitDoNothing(_LoweredFunc_); +VisitDoNothing(ScheduleBlock); +VisitDoNothing(ScheduleBlockRealize); +VisitDoNothing(Ramp); +VisitDoNothing(_Buffer_); +VisitDoNothing(_BufferRange_); + +#define NotVisitExprFields(NodeType) \ + void FeatureExtractor::Visit(const NodeType *x) {} + +NotVisitExprFields(_Tensor_) + +#define VisitForDtypePattern(NodeType, member) \ + void FeatureExtractor::Visit(const NodeType *x) { \ + if (x->type() == common::F32() || x->type() == common::F16() || x->type() == common::F64()) { \ + feature_.CurrentLoopBlock().float_##member += x->type().lanes(); \ + } else { \ + feature_.CurrentLoopBlock().int_##member += x->type().lanes(); \ + } \ + std::vector sub_exprs = x->expr_fields(); \ + for (const Expr *e : sub_exprs) { \ + if (e->defined()) { \ + Visit(e); \ + } \ + } \ + } + + VisitForDtypePattern(Add, add_or_sub); +VisitForDtypePattern(Sub, add_or_sub); +VisitForDtypePattern(Minus, add_or_sub); +VisitForDtypePattern(Mul, mul); +VisitForDtypePattern(Div, div_or_mod); +VisitForDtypePattern(Mod, div_or_mod); +VisitForDtypePattern(FracOp, div_or_mod); +VisitForDtypePattern(EQ, cmp); +VisitForDtypePattern(NE, cmp); +VisitForDtypePattern(GT, cmp); +VisitForDtypePattern(GE, cmp); +VisitForDtypePattern(LT, cmp); +VisitForDtypePattern(LE, cmp); +VisitForDtypePattern(Call, math_func); +VisitForDtypePattern(PrimitiveNode, math_func); +VisitForDtypePattern(Cast, other_call); +VisitForDtypePattern(Let, other_call); + +#define VisitForMultiOperandsDtypePattern(NodeType, member) \ + void FeatureExtractor::Visit(const NodeType *x) { \ + if (x->type() == common::F32() || x->type() == common::F16() || x->type() == common::F64()) { \ + feature_.CurrentLoopBlock().float_##member += (x->operands().size() - 1); \ + } else { \ + feature_.CurrentLoopBlock().int_##member += (x->operands().size() - 1); \ + } \ + std::vector sub_exprs = x->expr_fields(); \ + for (const Expr *e : sub_exprs) { \ + if (e->defined()) { \ + Visit(e); \ + } \ + } \ + } + +VisitForMultiOperandsDtypePattern(Sum, add_or_sub); +VisitForMultiOperandsDtypePattern(Product, mul); + +#define VisitCountMemberPattern(NodeType, member) \ + void FeatureExtractor::Visit(const NodeType *x) { \ + feature_.CurrentLoopBlock().member += 1; \ + std::vector sub_exprs = x->expr_fields(); \ + for (const Expr *e : sub_exprs) { \ + if (e->defined()) { \ + Visit(e); \ + } \ + } \ + } + 
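Since the three Visit macros above generate most of the extractor, it helps to see one expansion written out. Below is a hand-expanded form of VisitCountMemberPattern(Load, mem_read); the actual macro instantiations follow right after this sketch.

// Hand-expanded form of VisitCountMemberPattern(Load, mem_read), shown for
// illustration only; the instantiations below generate exactly this shape.
void FeatureExtractor::Visit(const Load *x) {
  // Each Load node counts as one memory read in the current loop block.
  feature_.CurrentLoopBlock().mem_read += 1;
  // Recurse into sub-expressions (e.g. the buffer and index expressions).
  std::vector<const Expr *> sub_exprs = x->expr_fields();
  for (const Expr *e : sub_exprs) {
    if (e->defined()) {
      Visit(e);
    }
  }
}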
+VisitCountMemberPattern(And, bool_op); +VisitCountMemberPattern(Or, bool_op); +VisitCountMemberPattern(Not, bool_op); +VisitCountMemberPattern(Max, select_op); +VisitCountMemberPattern(Min, select_op); +VisitCountMemberPattern(IfThenElse, select_op); +VisitCountMemberPattern(Select, select_op); +VisitCountMemberPattern(Alloc, mem_alloc); +VisitCountMemberPattern(Free, mem_free); +VisitCountMemberPattern(Load, mem_read); +VisitCountMemberPattern(Store, mem_write); + +/* Visit for loops */ + +void FeatureExtractor::Visit(const For *x) { + feature_.IntoLoopBlock(); + + LoopBlockFeature &loop_feature = feature_.CurrentLoopBlock(); + if (x->min.is_constant() && x->extent.is_constant()) { + loop_feature.loop_length = (x->extent.get_constant() - x->min.get_constant()); + } else { + loop_feature.loop_length = -1; // -1 represents unknown + } + + if (x->is_parallel()) { + loop_feature.loop_opt_type = ForOptimizeFeatureEnum::kParallel; + loop_feature.len_vthread = loop_feature.loop_length; + } else if (x->is_unrolled()) { + loop_feature.loop_opt_type = ForOptimizeFeatureEnum::kUnroll; + } else if (x->is_vectorized()) { + loop_feature.loop_opt_type = ForOptimizeFeatureEnum::kVectorize; + loop_feature.vectorize_factor = x->vectorize_info().factor; + } else if (x->is_binded()) { + loop_feature.loop_opt_type = ForOptimizeFeatureEnum::kGpuBind; + const BindInfo &bind_info = x->bind_info(); + int offset = bind_info.offset; + if (bind_info.for_type == ForType::GPUBlock) { + if (offset == 0) { + loop_feature.len_blockIdx_x = loop_feature.loop_length; + } else if (offset == 1) { + loop_feature.len_blockIdx_y = loop_feature.loop_length; + } else if (offset == 2) { + loop_feature.len_blockIdx_z = loop_feature.loop_length; + } + } else if (bind_info.for_type == ForType::GPUThread) { + if (offset == 0) { + loop_feature.len_threadIdx_x = loop_feature.loop_length; + } else if (offset == 1) { + loop_feature.len_threadIdx_y = loop_feature.loop_length; + } else if (offset == 2) { + loop_feature.len_threadIdx_z = loop_feature.loop_length; + } + } + } + + std::vector sub_exprs = x->expr_fields(); + for (const Expr *e : sub_exprs) { + Visit(e); + } + + feature_.ExitLoopBlock(); +} + +void FeatureExtractor::Visit(const PolyFor *x) { + Expr copy = optim::IRCopy(Expr(x)); + feature_.IntoLoopBlock(); + optim::TransformPolyForToFor(©); + ir::For *loop = copy.As(); + CHECK(loop != nullptr); + Visit(loop); + feature_.ExitLoopBlock(); +} + +/* Visit for Reduce and Broadcast */ + +void FeatureExtractor::Visit(const Reduce *x) { + if (x->type() == common::F32() || x->type() == common::F16() || x->type() == common::F64()) { + switch (x->reduce_type) { + case Reduce::ReduceType::kSum: + feature_.CurrentLoopBlock().float_reduce_sum_or_sub += x->type().lanes(); + break; + case Reduce::ReduceType::kSub: + feature_.CurrentLoopBlock().float_reduce_sum_or_sub += x->type().lanes(); + break; + case Reduce::ReduceType::kDiv: + feature_.CurrentLoopBlock().float_reduce_div += x->type().lanes(); + break; + case Reduce::ReduceType::kMul: + feature_.CurrentLoopBlock().float_reduce_mul += x->type().lanes(); + break; + case Reduce::ReduceType::kMax: + feature_.CurrentLoopBlock().float_reduce_max_or_min += x->type().lanes(); + break; + case Reduce::ReduceType::kMin: + feature_.CurrentLoopBlock().float_reduce_max_or_min += x->type().lanes(); + break; + } + } else { + switch (x->reduce_type) { + case Reduce::ReduceType::kSum: + feature_.CurrentLoopBlock().int_reduce_sum_or_sub += x->type().lanes(); + break; + case Reduce::ReduceType::kSub: + 
feature_.CurrentLoopBlock().int_reduce_sum_or_sub += x->type().lanes();
+        break;
+      case Reduce::ReduceType::kDiv:
+        feature_.CurrentLoopBlock().int_reduce_div += x->type().lanes();
+        break;
+      case Reduce::ReduceType::kMul:
+        feature_.CurrentLoopBlock().int_reduce_mul += x->type().lanes();
+        break;
+      case Reduce::ReduceType::kMax:
+        feature_.CurrentLoopBlock().int_reduce_max_or_min += x->type().lanes();
+        break;
+      case Reduce::ReduceType::kMin:
+        feature_.CurrentLoopBlock().int_reduce_max_or_min += x->type().lanes();
+        break;
+    }
+  }
+  std::vector<const Expr *> sub_exprs = x->expr_fields();
+  for (const Expr *e : sub_exprs) {
+    Visit(e);
+  }
+}
+VisitForDtypePattern(Broadcast, broadcast);
+
+/* Visit for IntrinsicOp */
+void FeatureExtractor::Visit(const IntrinsicOp *x) {
+  switch (x->getKind()) {
+#define __(op__)                                \
+  case IntrinsicKind::k##op__:                  \
+    Visit(llvm::dyn_cast<intrinsics::op__>(x)); \
+    break;
+
+    INTRINSIC_KIND_FOR_EACH(__)
+#undef __
+  }
+}
+
+VisitDoNothing(intrinsics::BufferGetDataHandle);
+VisitDoNothing(intrinsics::BufferGetDataConstHandle);
+VisitDoNothing(intrinsics::PodValueToX);
+VisitDoNothing(intrinsics::BufferCreate);
+VisitDoNothing(intrinsics::GetAddr);
+VisitDoNothing(intrinsics::ArgsConstruct);
+
+VisitForDtypePattern(intrinsics::BuiltinIntrin, other_call)
+
+}  // namespace auto_schedule
+}  // namespace cinn
diff --git a/paddle/cinn/auto_schedule/cost_model/feature_extractor.h b/paddle/cinn/auto_schedule/cost_model/feature_extractor.h
new file mode 100644
index 0000000000000..073eee27cac77
--- /dev/null
+++ b/paddle/cinn/auto_schedule/cost_model/feature_extractor.h
@@ -0,0 +1,60 @@
+// Copyright (c) 2022 CINN Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
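The IntrinsicOp dispatch above is driven by the INTRINSIC_KIND_FOR_EACH X-macro. Assuming the kind enumerators follow the k##op__ naming used in the macro, one generated case looks like the fragment below (hand-expanded for illustration; it is an excerpt of the switch body, not standalone code).

// One case generated by the __ macro above, assuming an enumerator named
// IntrinsicKind::kBufferCreate (naming taken from the VisitDoNothing list).
case IntrinsicKind::kBufferCreate:
  Visit(llvm::dyn_cast<intrinsics::BufferCreate>(x));
  break;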
+ +#pragma once + +#include "cinn/auto_schedule/cost_model/feature.h" +#include "cinn/common/target.h" +#include "cinn/ir/ir.h" +#include "cinn/ir/ir_base.h" +#include "cinn/ir/ir_schedule.h" +#include "cinn/ir/ir_visitor.h" + +namespace cinn { +namespace auto_schedule { + +class FeatureExtractor : public ir::IRVisitor { + public: + FeatureExtractor(); + Feature Extract(const ir::ModuleExpr& mod_expr, const common::Target& target); + + void Visit(const Expr* x) override; + +#define __(op__) void Visit(const ir::op__* x) override; + NODETY_FORALL(__) +#undef __ + +#define __(op__) virtual void Visit(const ir::intrinsics::op__* x); + INTRINSIC_KIND_FOR_EACH(__) +#undef __ + + private: + Feature feature_; +}; + +} // namespace auto_schedule +} // namespace cinn diff --git a/paddle/cinn/auto_schedule/cost_model/feature_extractor_test.cc b/paddle/cinn/auto_schedule/cost_model/feature_extractor_test.cc new file mode 100644 index 0000000000000..ed0cd984c93de --- /dev/null +++ b/paddle/cinn/auto_schedule/cost_model/feature_extractor_test.cc @@ -0,0 +1,158 @@ +// Copyright (c) 2022 CINN Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "cinn/auto_schedule/cost_model/feature_extractor.h" + +#include +#include + +#include +#include +#include + +#include "cinn/common/context.h" +#include "cinn/ir/ir.h" +#include "cinn/ir/ir_base.h" +#include "cinn/ir/ir_schedule.h" +#include "cinn/lang/builtin.h" +#include "cinn/lang/compute.h" +#include "cinn/lang/lower.h" +#include "cinn/lang/placeholder.h" +#include "cinn/poly/stage.h" + +namespace cinn { +namespace auto_schedule { + +TEST(FeatureExtractor, SimpleAssign) { + Context::Global().ResetNameId(); +#ifdef CINN_WITH_CUDA + Target target = common::DefaultNVGPUTarget(); +#else + Target target = common::DefaultHostTarget(); +#endif + ir::Expr M(32); + ir::Expr N(32); + + lang::Placeholder A("A", {M, N}); + ir::Tensor B = lang::Compute( + {M, N}, [&](Var i, Var j) { return A(i, j); }, "B"); + + poly::StageMap stages = poly::CreateStages({A, B}); + std::vector funcs = lang::LowerVec("SimpleAssign", stages, {A, B}, {}, {}, nullptr, target, true); + ir::Expr ast_expr = funcs[0]->body; + VLOG(6) << "Expr to test: " << ast_expr; + + std::vector vec_ast{ast_expr}; + ir::ModuleExpr mod_expr(vec_ast); + + FeatureExtractor extractor; + + Feature feature = extractor.Extract(mod_expr, target); + + std::vector to_check = feature.ToFixedSizeVector(); + + ASSERT_EQ(to_check.size(), static_cast(LoopBlockFeature::kTotalSize + 1)); + VLOG(6) << "Feature data before slog:"; + for (size_t i = 0; i < to_check.size(); ++i) { + VLOG(6) << i << " " << (std::pow(2, to_check[i]) - 1); + if (i != 0 && i != 17 && i != 18 && i != 29) { + ASSERT_EQ(to_check[i], 0); + } + } + // target +#ifdef CINN_WITH_CUDA + ASSERT_EQ(to_check[0], 1); +#else + ASSERT_EQ(to_check[0], 0); +#endif + // mem_read + ASSERT_EQ(to_check[17], slog(M.get_constant() * N.get_constant())); // mem_read + // mem_write + ASSERT_EQ(to_check[18], slog(M.get_constant() * 
N.get_constant())); // mem_write + // non-opt loops, including root block + ASSERT_EQ(to_check[29], slog(3)); +} + +TEST(FeatureExtractor, MatrixMultiply) { + Context::Global().ResetNameId(); +#ifdef CINN_WITH_CUDA + Target target = common::DefaultNVGPUTarget(); +#else + Target target = common::DefaultHostTarget(); +#endif + + ir::Expr M(2); + ir::Expr N(2); + ir::Expr K(4); + + lang::Placeholder A("A", {M, K}); + lang::Placeholder B("B", {K, N}); + + ir::Var k(K.as_int32(), "reduce_axis_k"); + ir::Tensor C = lang::Compute( + {M, N}, [&](Var i, Var j) { return lang::ReduceSum(A(i, k) * B(k, j), {k}); }, "C"); + + poly::StageMap stages = poly::CreateStages({C}); + std::vector funcs = lang::LowerVec("MatrixMultiply", stages, {C}, {}, {}, nullptr, target, true); + + std::vector vec_ast{funcs[0]->body}; + ir::ModuleExpr mod_expr(vec_ast); + ir::IRSchedule ir_sch(mod_expr); + std::vector blocks = ir_sch.GetAllBlocks(); + std::vector loops = ir_sch.GetLoops(blocks[0]); + ir_sch.Bind(loops.back(), "threadIdx.x"); + + ir::Expr ast_expr = mod_expr.GetExprs()[0]; + VLOG(6) << "Expr to test: " << ast_expr; + + FeatureExtractor extractor; + Feature feature = extractor.Extract(mod_expr, target); + + std::vector to_check = feature.ToFixedSizeVector(); + + ASSERT_EQ(to_check.size(), static_cast(LoopBlockFeature::kTotalSize + 1)); + std::unordered_set non_zero_indice = {0, 1, 2, 17, 18, 29, 30, 37}; + for (size_t i = 0; i < to_check.size(); ++i) { + VLOG(6) << i << " " << (std::pow(2, to_check[i]) - 1); + if (!non_zero_indice.count(i)) { + ASSERT_EQ(to_check[i], 0); + } + } + // target +#ifdef CINN_WITH_CUDA + ASSERT_EQ(to_check[0], 1); +#else + ASSERT_EQ(to_check[0], 0); +#endif + float out_loop = M.get_constant() * N.get_constant(); + float total_loop = out_loop * K.get_constant(); + // float_mul + ASSERT_EQ(to_check[1], slog(total_loop)); + // float_add_or_sub + ASSERT_EQ(to_check[2], slog(total_loop)); + // mem_read + ASSERT_EQ(to_check[17], slog(total_loop * 3)); + // mem_write + ASSERT_EQ(to_check[18], slog(total_loop + out_loop)); + + // non-opt loops, including root block + ASSERT_EQ(to_check[29], slog(3)); + // GpuBind loop + ASSERT_EQ(to_check[30], slog(1)); + // GpuBind loop + ASSERT_EQ(to_check[37], slog(out_loop)); +} + +} // namespace auto_schedule +} // namespace cinn diff --git a/paddle/cinn/auto_schedule/cost_model/feature_test.cc b/paddle/cinn/auto_schedule/cost_model/feature_test.cc new file mode 100644 index 0000000000000..908672d41b404 --- /dev/null +++ b/paddle/cinn/auto_schedule/cost_model/feature_test.cc @@ -0,0 +1,28 @@ +// Copyright (c) 2022 CINN Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
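The assertions above compare raw counts through slog, and the VLOG lines invert the values with std::pow(2, v) - 1, which implies a base-2 log-plus-one transform. A hedged sketch of that helper follows; the real definition ships with the cost model utilities, not in this hunk.

#include <cmath>

// Hedged sketch of slog, inferred from how the tests above undo it via
// std::pow(2, v) - 1. This is an assumption about its shape, not its source.
float slog(float x) { return std::log2(x + 1.0f); }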
+ +#include "cinn/auto_schedule/cost_model/feature.h" + +#include +#include + +namespace cinn { +namespace auto_schedule { + +TEST(Feature, Basic) { + // TODO(zhhsplendid): add some basic tests +} + +} // namespace auto_schedule +} // namespace cinn diff --git a/paddle/cinn/auto_schedule/cost_model/xgb_cost_model.cc b/paddle/cinn/auto_schedule/cost_model/xgb_cost_model.cc new file mode 100644 index 0000000000000..8549442688033 --- /dev/null +++ b/paddle/cinn/auto_schedule/cost_model/xgb_cost_model.cc @@ -0,0 +1,135 @@ +// Copyright (c) 2022 CINN Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "cinn/auto_schedule/cost_model/xgb_cost_model.h" + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "cinn/common/python_interpreter_guard.h" + +namespace cinn { +namespace auto_schedule { + +std::atomic XgbCostModel::xgb_cost_model_count_(0); + +// Convert 1D vector to py numpy +template +pybind11::array VectorToNumpy(const std::vector& vec) { + return pybind11::array(pybind11::cast(vec)); +} + +// Convert 2D vector to py numpy +template +pybind11::array VectorToNumpy(const std::vector>& vec) { + if (vec.size() == 0) { + return pybind11::array(pybind11::dtype::of(), {0, 0}); + } + + std::vector shape{vec.size(), vec[0].size()}; + pybind11::array ret(pybind11::dtype::of(), shape); + + Dtype* py_data = static_cast(ret.mutable_data()); + for (size_t i = 0; i < vec.size(); ++i) { + assert(vec[i].size() == shape[1] && "Sub vectors must have same size in VectorToNumpy"); + memcpy(py_data + (shape[1] * i), vec[i].data(), shape[1] * sizeof(Dtype)); + } + return ret; +} + +// the Pybind default Python interpreter doesn't contain some paths in +// sys.path, so we have to add it. +// +// Note: the Pybind default Python interpreter only uses default Python. +// Something may be wrong when users use virtual Python environment. +void AddDistPkgToPythonSysPath() { + pybind11::module sys_py_mod = pybind11::module::import("sys"); + // short version such as "3.7", "3.8", ... 
+ std::string py_short_version = sys_py_mod.attr("version").cast().substr(0, 3); + + std::string site_pkg_str = "/usr/local/lib/python" + py_short_version + "/dist-packages"; + sys_py_mod.attr("path").attr("append")(site_pkg_str); + + // TODO(zhhsplendid): warning to users if setuptools hasn't been installed + DIR* site_pkg_dir = opendir(site_pkg_str.c_str()); + if (site_pkg_dir != nullptr) { + std::regex setuptool_regex("setuptools-.*-py" + py_short_version + "\\.egg"); + struct dirent* entry = nullptr; + while ((entry = readdir(site_pkg_dir)) != nullptr) { + if (std::regex_match(entry->d_name, setuptool_regex)) { + sys_py_mod.attr("path").attr("append")(site_pkg_str + "/" + entry->d_name); + } + } + closedir(site_pkg_dir); + } +} + +XgbCostModel::XgbCostModel() { + common::PythonInterpreterGuard::Guard(); + int previous = xgb_cost_model_count_.fetch_add(1); + if (previous == 0) { + AddDistPkgToPythonSysPath(); + } + xgb_module_ = pybind11::module::import("xgboost"); + xgb_booster_ = xgb_module_.attr("Booster")(); +} + +void XgbCostModel::Train(const std::vector>& samples, const std::vector& labels) { + update_samples_ = samples; + update_labels_ = labels; + pybind11::array np_samples = VectorToNumpy(samples); + pybind11::array np_labels = VectorToNumpy(labels); + + pybind11::object dmatrix = xgb_module_.attr("DMatrix")(np_samples, np_labels); + xgb_booster_ = xgb_module_.attr("train")(pybind11::dict(), dmatrix, pybind11::int_(kTrainRound_)); +} + +std::vector XgbCostModel::Predict(const std::vector>& samples) const { + pybind11::array np_samples = VectorToNumpy(samples); + pybind11::object dmatrix = xgb_module_.attr("DMatrix")(np_samples); + pybind11::array py_result = xgb_booster_.attr("predict")(dmatrix); + return py_result.cast>(); +} + +void XgbCostModel::Update(const std::vector>& samples, const std::vector& labels) { + update_samples_.insert(update_samples_.end(), samples.begin(), samples.end()); + update_labels_.insert(update_labels_.end(), labels.begin(), labels.end()); + pybind11::array np_samples = VectorToNumpy(update_samples_); + pybind11::array np_labels = VectorToNumpy(update_labels_); + + pybind11::object dmatrix = xgb_module_.attr("DMatrix")(np_samples, np_labels); + xgb_booster_ = xgb_module_.attr("train")(pybind11::dict(), dmatrix, pybind11::int_(kTrainRound_)); +} + +void XgbCostModel::Save(const std::string& path) { xgb_booster_.attr("save_model")(pybind11::str(path)); } + +void XgbCostModel::Load(const std::string& path) { xgb_booster_.attr("load_model")(pybind11::str(path)); } + +} // namespace auto_schedule +} // namespace cinn diff --git a/paddle/cinn/auto_schedule/cost_model/xgb_cost_model.h b/paddle/cinn/auto_schedule/cost_model/xgb_cost_model.h new file mode 100644 index 0000000000000..69dbb8a7f3904 --- /dev/null +++ b/paddle/cinn/auto_schedule/cost_model/xgb_cost_model.h @@ -0,0 +1,75 @@ +// Copyright (c) 2022 CINN Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
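Taken together, the methods above follow a plain train/predict/update cycle. Here is a minimal usage sketch; the sample data and save path are made up for illustration, and it mirrors what the unit test below exercises.

// Minimal usage sketch of the XgbCostModel API defined in this file.
void ExampleXgbCostModelUsage() {
  XgbCostModel model;
  std::vector<std::vector<float>> samples = {{1.0f, 2.0f}, {3.0f, 4.0f}};
  std::vector<float> labels = {0.5f, 0.9f};
  model.Train(samples, labels);
  std::vector<float> scores = model.Predict(samples);  // one score per sample
  model.Update(samples, labels);  // Update retrains on all accumulated data
  model.Save("/tmp/xgb_model.bin");  // illustrative path
}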
+
+#pragma once
+
+#include <pybind11/embed.h>
+
+#include <atomic>
+#include <memory>
+#include <mutex>
+#include <string>
+#include <vector>
+
+#include "cinn/common/cost_model.h"
+
+namespace cinn {
+namespace auto_schedule {
+
+/**
+ * A C++ cost model that calls Python xgboost via pybind11.
+ *
+ * Note: this class manages the Python interpreter lifetime itself.
+ * If you need to call other Python functions outside this class and run into
+ * a lifetime conflict, see cinn::common::PythonInterpreterGuard.
+ *
+ * For cinn::common::PythonInterpreterGuard, see:
+ * cinn/common/python_interpreter_guard.h .cc
+ *
+ * For pybind11 interpreter lifetime management, see:
+ *
+ * https://pybind11.readthedocs.io/en/stable/advanced/embedding.html#interpreter-lifetime
+ * https://pybind11.readthedocs.io/en/stable/reference.html#_CPPv422initialize_interpreterbiPPCKcb
+ */
+class XgbCostModel : public CostModel {
+ public:
+  XgbCostModel();
+  ~XgbCostModel() = default;
+
+  void Train(const std::vector<std::vector<float>>& samples, const std::vector<float>& labels) override;
+
+  std::vector<float> Predict(const std::vector<std::vector<float>>& samples) const override;
+
+  void Update(const std::vector<std::vector<float>>& samples, const std::vector<float>& labels) override;
+
+  void Save(const std::string& path) override;
+
+  void Load(const std::string& path) override;
+
+ private:
+  // Python xgboost module
+  pybind11::module xgb_module_;
+  // Object pointing to the Python xgb.Booster()
+  pybind11::object xgb_booster_;
+  // Atomic counter used to manage the Python interpreter lifetime and the
+  // one-time package path setup
+  static std::atomic<int> xgb_cost_model_count_;
+  // Default number of training rounds
+  static constexpr int kTrainRound_ = 10;
+
+  std::vector<std::vector<float>> update_samples_;
+  std::vector<float> update_labels_;
+};
+
+}  // namespace auto_schedule
+}  // namespace cinn
diff --git a/paddle/cinn/auto_schedule/cost_model/xgb_cost_model_test.cc b/paddle/cinn/auto_schedule/cost_model/xgb_cost_model_test.cc
new file mode 100644
index 0000000000000..f237699a94406
--- /dev/null
+++ b/paddle/cinn/auto_schedule/cost_model/xgb_cost_model_test.cc
@@ -0,0 +1,69 @@
+// Copyright (c) 2022 CINN Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+ +#include "cinn/auto_schedule/cost_model/xgb_cost_model.h" + +#include +#include +#include + +#include +#include +#include +#include + +namespace cinn { +namespace auto_schedule { + +TEST(CostModel, Basic) { + XgbCostModel cost_model; + + srand(time(NULL)); + + int batch_size = 16; + int feature_size = 8; + std::vector labels(batch_size, 1.0); + std::vector> samples(batch_size, std::vector(feature_size)); + for (int i = 0; i < batch_size; ++i) { + for (int j = 0; j < feature_size; ++j) { + samples[i][j] = rand() % 10; + } + } + + cost_model.Train(samples, labels); + std::vector pred = cost_model.Predict(samples); + + std::string path = "./test_cost_model.cpp_save_model"; + cost_model.Save(path); + + XgbCostModel load_cost_model; + load_cost_model.Load(path); + std::vector load_pred = cost_model.Predict(samples); + + ASSERT_EQ(pred.size(), load_pred.size()); + for (size_t i = 0; i < pred.size(); ++i) { + ASSERT_FLOAT_EQ(pred[i], load_pred[i]); + VLOG(6) << "pred[" << i << "] = " << pred[i]; + } + std::remove(path.c_str()); + + cost_model.Update(samples, labels); + pred = cost_model.Predict(samples); + for (size_t i = 0; i < pred.size(); ++i) { + VLOG(6) << "pred[" << i << "] = " << pred[i]; + } +} + +} // namespace auto_schedule +} // namespace cinn diff --git a/paddle/cinn/auto_schedule/database/CMakeLists.txt b/paddle/cinn/auto_schedule/database/CMakeLists.txt new file mode 100644 index 0000000000000..1c3ca9330ba8c --- /dev/null +++ b/paddle/cinn/auto_schedule/database/CMakeLists.txt @@ -0,0 +1,6 @@ +core_gather_headers() + +gather_srcs(cinnapi_src SRCS database.cc jsonfile_database.cc) + +cc_test(test_database SRCS database_test.cc DEPS cinncore) +cc_test(test_jsonfile_database SRCS jsonfile_database_test.cc DEPS cinncore) diff --git a/paddle/cinn/auto_schedule/database/database.cc b/paddle/cinn/auto_schedule/database/database.cc new file mode 100644 index 0000000000000..87cfd63007db4 --- /dev/null +++ b/paddle/cinn/auto_schedule/database/database.cc @@ -0,0 +1,122 @@ +// Copyright (c) 2022 CINN Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "cinn/auto_schedule/database/database.h" + +#include +#include +#include + +#include "cinn/auto_schedule/database/jsonfile_database.h" +#include "cinn/auto_schedule/task/task_registry.h" +#include "cinn/ir/ir_schedule.h" +#include "cinn/ir/schedule_desc.h" + +namespace cinn { +namespace auto_schedule { + +bool TuningRecord::Compare::operator()(const TuningRecord& lhs, const TuningRecord& rhs) const { + return lhs.execution_cost < rhs.execution_cost; +} + +proto::TuningRecord TuningRecord::ToProto() const { + proto::TuningRecord record_proto; + record_proto.set_task_key(task_key); + record_proto.set_execution_cost(execution_cost); + record_proto.set_predicted_cost(predicted_cost); + record_proto.mutable_trace()->CopyFrom(trace); + return record_proto; +} + +Database::Database(int capacity_per_task) : capacity_per_task_(capacity_per_task) { + CHECK_GT(capacity_per_task_, 0) << "capacity_per_task_ should be greater than 0"; +} + +std::unique_ptr Database::Make(const DatabaseConfig& config) { + if (config.type == DatabaseType::kMemory) { + return std::make_unique(config.capacity_per_task); + } else if (config.type == DatabaseType::kJSONFile) { + return std::make_unique(config.capacity_per_task, config.record_file_path, true); + } + + LOG(FATAL) << "Unimplemented database type."; + return nullptr; +} + +void Database::Insert(const TuningRecord& record) { + auto& records = key2record_[record.task_key]; + records.emplace(record); + if (records.size() > capacity_per_task_) { + records.erase(std::prev(records.end())); + } +} + +bool Database::AddRecord(const TuningRecord& record) { + CHECK(!record.task_key.empty()) << "task_key of TuningRecord can't be empty"; + + Insert(record); + return Commit(record); +} + +std::vector Database::LookUp(const std::string& task_key) { + auto fit = key2record_.find(task_key); + if (fit == key2record_.end()) { + return {}; + } + + std::vector results; + results.reserve(fit->second.size()); + results.assign(fit->second.begin(), fit->second.end()); + return results; +} + +std::vector Database::GetTopK(const std::string& task_key, int k) { + auto fit = key2record_.find(task_key); + if (fit == key2record_.end() || k <= 0) { + return {}; + } + if (k > capacity_per_task_) { + LOG(WARNING) << "Top k=" << k << " is greater than the capacity, will adjust k=" << capacity_per_task_; + k = capacity_per_task_; + } + + std::vector results; + results.reserve(k); + for (const TuningRecord& record : fit->second) { + results.emplace_back(record); + if (results.size() == k) { + break; + } + } + return results; +} + +size_t Database::Size() { + auto res = + std::accumulate(key2record_.begin(), key2record_.end(), size_t(0), [](size_t res, const auto& kv) -> size_t { + return std::move(res) + kv.second.size(); + }); + return res; +} + +size_t Database::Count(const std::string& task_key) { + auto fit = key2record_.find(task_key); + if (fit == key2record_.end()) { + return 0; + } + return fit->second.size(); +} + +} // namespace auto_schedule +} // namespace cinn diff --git a/paddle/cinn/auto_schedule/database/database.h b/paddle/cinn/auto_schedule/database/database.h new file mode 100644 index 0000000000000..4487272b23875 --- /dev/null +++ b/paddle/cinn/auto_schedule/database/database.h @@ -0,0 +1,102 @@ +// Copyright (c) 2022 CINN Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include <memory>
+#include <set>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "cinn/auto_schedule/auto_schedule.pb.h"
+#include "cinn/auto_schedule/search_space/search_state.h"
+#include "cinn/ir/schedule_desc.pb.h"
+
+namespace cinn {
+namespace auto_schedule {
+
+// Record related data about the tuning process of a measure candidate
+struct TuningRecord {
+  // the unique key to identify a task
+  std::string task_key;
+  // the cost predicted by the CostModel
+  float predicted_cost;  // unit: us
+  // the ScheduleDesc of this tuning process
+  ir::proto::ScheduleDesc trace;
+  // the execution time of the candidate measured during the measure phase
+  double execution_cost;  // unit: us
+
+  TuningRecord() = default;
+  TuningRecord(const proto::TuningRecord& record)
+      : task_key(record.task_key()),
+        predicted_cost(record.predicted_cost()),
+        trace(record.trace()),
+        execution_cost(record.execution_cost()) {}
+  TuningRecord(const std::string& task_key, const SearchState& state, double execution_cost)
+      : task_key(task_key),
+        predicted_cost(state->predicted_cost),
+        trace(state->ir_schedule.GetTraceDesc().ToProto()),
+        execution_cost(execution_cost) {}
+
+  // convert to proto object
+  proto::TuningRecord ToProto() const;
+
+  // a binary comparison function that decides whether the left record
+  // is sorted in front of the right one
+  struct Compare {
+    bool operator()(const TuningRecord& lhs, const TuningRecord& rhs) const;
+  };
+};
+
+enum class DatabaseType : int { kMemory, kJSONFile };
+
+struct DatabaseConfig {
+  DatabaseType type = DatabaseType::kMemory;
+  int capacity_per_task = 2;
+  std::string record_file_path = "/tmp/tuning_record.json";
+};
+
+// A database that supports inserting and looking up historical tuning results with specified traits.
+// It can be implemented with a concrete storage backend to save/load the underlying data,
+// such as memory, a file, or a database server; this base class can be regarded as
+// the one using memory as its underlying storage medium.
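Before the class definition just below, a short usage sketch of the factory configuration above; the values are examples only, and Database::Make is the factory defined in database.cc earlier in this patch.

// Example: build a JSON-file-backed database through the Make factory.
void ExampleMakeDatabase() {
  DatabaseConfig config;
  config.type = DatabaseType::kJSONFile;
  config.capacity_per_task = 4;                          // illustrative value
  config.record_file_path = "/tmp/example_records.json"; // illustrative path
  std::unique_ptr<Database> db = Database::Make(config);
}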
+class Database {
+ public:
+  explicit Database(int capacity_per_task);
+  ~Database() = default;
+
+  // Create a Database with the specified config
+  static std::unique_ptr<Database> Make(const DatabaseConfig& config);
+
+  // add a record into the database
+  bool AddRecord(const TuningRecord& record);
+  // return all records whose task_keys are equal to the specified key
+  std::vector<TuningRecord> LookUp(const std::string& task_key);
+  // return the top-k records of the sorted candidates
+  std::vector<TuningRecord> GetTopK(const std::string& task_key, int k);
+  // return the total number of stored candidates
+  size_t Size();
+  // return the number of stored candidates with the specified key
+  size_t Count(const std::string& task_key);
+
+ protected:
+  // commit the newly added record into the underlying storage
+  virtual bool Commit(const TuningRecord& record) { return true; }
+  // insert a newly added record into the memory storage
+  void Insert(const TuningRecord& record);
+
+  // map from task_key to its records
+  std::unordered_map<std::string, std::multiset<TuningRecord, TuningRecord::Compare>> key2record_;
+  // the max number of candidates stored per task
+  const int capacity_per_task_;
+};
+
+}  // namespace auto_schedule
+}  // namespace cinn
diff --git a/paddle/cinn/auto_schedule/database/database_test.cc b/paddle/cinn/auto_schedule/database/database_test.cc
new file mode 100644
index 0000000000000..2e06f4a56be0b
--- /dev/null
+++ b/paddle/cinn/auto_schedule/database/database_test.cc
@@ -0,0 +1,70 @@
+// Copyright (c) 2022 CINN Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+ +#include "cinn/auto_schedule/database/database.h" + +#include + +#include + +#include "cinn/auto_schedule/auto_schedule.pb.h" +#include "cinn/auto_schedule/search_space/search_state.h" +#include "cinn/ir/ir_schedule.h" + +namespace cinn { +namespace auto_schedule { + +class TestDatabase : public ::testing::Test { + public: + TestDatabase() : test_db(2) { + auto state = SearchState(ir::IRSchedule()); + test_db.AddRecord(TuningRecord("k1", state, 1.0)); + test_db.AddRecord(TuningRecord("k2", state, 2.0)); + test_db.AddRecord(TuningRecord("k2", state, 3.0)); + test_db.AddRecord(TuningRecord("k3", state, 3.0)); + test_db.AddRecord(TuningRecord("k3", state, 4.0)); + test_db.AddRecord(TuningRecord("k3", state, 5.0)); + test_db.AddRecord(TuningRecord("k4", state, 4.0)); + } + + void SetUp() override {} + Database test_db; +}; + +TEST_F(TestDatabase, Basic) { + ASSERT_EQ(test_db.Size(), 6); + auto records = test_db.LookUp("k3"); + // check the max number of stored candidates will + // be restricted to capacity_per_task + ASSERT_EQ(test_db.Count("k3"), 2); + ASSERT_EQ(records.size(), 2); + EXPECT_EQ(records[0].execution_cost, 3.0); + EXPECT_EQ(records[1].execution_cost, 4.0); +} + +TEST_F(TestDatabase, GetTopK) { + ASSERT_TRUE(test_db.GetTopK("k5", 2).empty()); + ASSERT_EQ(test_db.GetTopK("k4", 3).size(), 1); + + test_db.AddRecord(TuningRecord("k4", SearchState(ir::IRSchedule(), 1.2), 2.0)); + test_db.AddRecord(TuningRecord("k4", SearchState(ir::IRSchedule(), 1.0), 3.0)); + + auto records = test_db.GetTopK("k4", 3); + ASSERT_EQ(records.size(), 2); + EXPECT_FLOAT_EQ(records[0].predicted_cost, 1.2); + EXPECT_FLOAT_EQ(records[1].predicted_cost, 1.0); +} + +} // namespace auto_schedule +} // namespace cinn diff --git a/paddle/cinn/auto_schedule/database/jsonfile_database.cc b/paddle/cinn/auto_schedule/database/jsonfile_database.cc new file mode 100644 index 0000000000000..3a7eb677183f3 --- /dev/null +++ b/paddle/cinn/auto_schedule/database/jsonfile_database.cc @@ -0,0 +1,99 @@ +// Copyright (c) 2022 CINN Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "cinn/auto_schedule/database/jsonfile_database.h" + +#include +#include +#include + +#include + +#include "cinn/auto_schedule/auto_schedule.pb.h" +#include "cinn/auto_schedule/task/task_registry.h" +#include "cinn/utils/multi_threading.h" + +namespace cinn { +namespace auto_schedule { + +// append a line to file +void AppendLineToFile(const std::string& file_path, const std::string& line) { + std::ofstream os(file_path, std::ofstream::app); + CHECK(os.good()) << "Cannot open the file to write: " << file_path; + os << line << std::endl; +} + +// read lines from a json file +std::vector ReadLinesFromFile(const std::string& file_path, bool allow_new_file) { + std::ifstream is(file_path); + if (is.good()) { + std::vector json_strs; + for (std::string str; std::getline(is, str);) { + json_strs.push_back(str); + } + + return json_strs; + } + CHECK(allow_new_file) << "File doesn't exist: " << file_path; + std::ofstream os(file_path); + CHECK(os.good()) << "Cannot create new file: " << file_path; + return {}; +} + +JSONFileDatabase::JSONFileDatabase(int capacity_per_task, const std::string& record_file_path, bool allow_new_file) + : Database(capacity_per_task), record_file_path_(record_file_path) { + VLOG(3) << "Auto schedule will save/load tuning records on file:" << record_file_path; + auto json_lines = ReadLinesFromFile(record_file_path_, allow_new_file); + std::vector all_records_proto(json_lines.size()); + + // convert JSON string to proto object + auto worker_fn = [this, &json_lines, &all_records_proto](int index) { + cinn::auto_schedule::proto::TuningRecord record_proto; + auto status = google::protobuf::util::JsonStringToMessage(json_lines[index], &record_proto); + CHECK(status.ok()) << "Failed to parse JSON: " << json_lines[index]; + all_records_proto[index].Swap(&record_proto); + }; + utils::parallel_run(worker_fn, utils::SequenceDispatcher(0, json_lines.size()), -1); + + InitialTaskRegistry* task_registry = InitialTaskRegistry::Global(); + + for (const auto& record_proto : all_records_proto) { + std::string task_key = record_proto.task_key(); + if (task_registry->Has(task_key)) { + VLOG(4) << "Add a measured TuningRecord with task_key=" << task_key; + Insert(TuningRecord(record_proto)); + } + } +} + +// convert a TuningRecord object to string in JSON format +std::string JSONFileDatabase::RecordToJSON(const TuningRecord& record) { + proto::TuningRecord record_proto = record.ToProto(); + std::string json_string; + auto status = google::protobuf::util::MessageToJsonString(record_proto, &json_string); + CHECK(status.ok()) << "Failed to serialize record to JSON, task key = " << record.task_key; + VLOG(4) << "json_string = \n" << json_string; + + return json_string; +} + +bool JSONFileDatabase::Commit(const TuningRecord& record) { + std::string json_string = RecordToJSON(record); + AppendLineToFile(record_file_path_, json_string); + + return true; +} + +} // namespace auto_schedule +} // namespace cinn diff --git a/paddle/cinn/auto_schedule/database/jsonfile_database.h b/paddle/cinn/auto_schedule/database/jsonfile_database.h new file mode 100644 index 0000000000000..540013c224d5f --- /dev/null +++ b/paddle/cinn/auto_schedule/database/jsonfile_database.h @@ -0,0 +1,52 @@ +// Copyright (c) 2022 CINN Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "cinn/auto_schedule/database/database.h" + +namespace cinn { +namespace auto_schedule { + +// JSONFileDatabase is a database implemented by JSON file to save/load underlying data. +class JSONFileDatabase : public Database { + public: + /*! + * \brief Build a JSONFileDatabase object from a json file. + * \param capacity_per_task The max number of candidates stored. + * \param record_file_path The path of the json file. + * \param allow_new_file Whether to create new file when the given path is not found. + */ + JSONFileDatabase(int capacity_per_task, const std::string& record_file_path, bool allow_new_file); + ~JSONFileDatabase() = default; + + // convert a TuningRecord object to string in JSON format + std::string RecordToJSON(const TuningRecord& record); + + protected: + // commit the newly added record into json file + bool Commit(const TuningRecord& record) override; + + // the name of the json file to save tuning records. + std::string record_file_path_; +}; + +// append a line to file +void AppendLineToFile(const std::string& file_path, const std::string& line); + +// read lines from a json file +std::vector ReadLinesFromFile(const std::string& file_path, bool allow_new_file = true); + +} // namespace auto_schedule +} // namespace cinn diff --git a/paddle/cinn/auto_schedule/database/jsonfile_database_test.cc b/paddle/cinn/auto_schedule/database/jsonfile_database_test.cc new file mode 100644 index 0000000000000..6ace45ea19478 --- /dev/null +++ b/paddle/cinn/auto_schedule/database/jsonfile_database_test.cc @@ -0,0 +1,214 @@ +// Copyright (c) 2022 CINN Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
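Before the tests, a hedged sketch of how the pieces above compose: read a stored line back, parse it into a proto, and replay its trace on a fresh schedule. Every call here appears elsewhere in this patch except MakeFreshSchedule, which is a hypothetical helper; see MakeIRSchedule in the test below for one concrete way to rebuild a schedule.

// Hedged end-to-end sketch; error handling is trimmed for brevity.
void ExampleLoadAndReplay() {
  std::vector<std::string> lines = ReadLinesFromFile("/tmp/tuning_record.json", false);
  cinn::auto_schedule::proto::TuningRecord record_proto;
  auto status = google::protobuf::util::JsonStringToMessage(lines[0], &record_proto);
  CHECK(status.ok()) << "Failed to parse JSON: " << lines[0];
  TuningRecord record(record_proto);
  ir::IRSchedule sch = MakeFreshSchedule();  // hypothetical helper
  ir::ScheduleDesc::ReplayWithProto(record.trace, &sch);
}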
+ +#include "cinn/auto_schedule/database/jsonfile_database.h" + +#include +#include + +#include +#include + +#include "cinn/auto_schedule/search_space/search_state.h" +#include "cinn/auto_schedule/task/task_registry.h" +#include "cinn/cinn.h" +#include "cinn/ir/ir_printer.h" +#include "cinn/ir/ir_schedule.h" +#include "cinn/optim/ir_copy.h" + +namespace cinn { +namespace auto_schedule { + +// Return lowerd ir AST for example functions used in this test +std::vector LowerCompute(const std::vector& shape, const Target& target) { + CHECK(shape.size() == 2) << "shape should be 2"; + std::vector domain; + for (auto i = 0; i < shape.size(); ++i) { + domain.emplace_back(shape[i]); + } + + Placeholder A("A", domain); + ir::Tensor B, C; + + B = Compute( + domain, [&A](Var i, Var j) { return A(i, j); }, "B"); + C = Compute( + domain, [&B](Var i, Var j) { return B(i, j); }, "C"); + + return cinn::lang::LowerVec("test_func", CreateStages({A, B}), {A, B}, {}, {}, nullptr, target, true); +} + +// Create a new IRSchedule with copied ir::LoweredFunc AST +ir::IRSchedule MakeIRSchedule(const std::vector& lowered_funcs, const std::string& task_key) { + std::vector exprs; + for (auto&& func : lowered_funcs) { + exprs.emplace_back(optim::IRCopy(func->body)); + } + InitialTaskRegistry* task_registry = InitialTaskRegistry::Global(); + task_registry->Regist(task_key, ir::ModuleExpr(exprs)); + + return ir::IRSchedule(ir::ModuleExpr(exprs)); +} + +class TestJSONFileDatabase : public ::testing::Test { + public: + TestJSONFileDatabase() : record_file_path("/tmp/test_record.json"), test_db(2, record_file_path, true) {} + + void SetUp() override { lowered_funcs = LowerCompute({32, 32}, target); } + + void TearDown() override { + auto isFileExists = [](const std::string& file_path) -> bool { + std::ifstream f(file_path.c_str()); + return f.good(); + }; + if (isFileExists(record_file_path)) { + if (remove(record_file_path.c_str()) == 0) { + LOG(INFO) << "Successfully deleted file: " << record_file_path; + } else { + LOG(INFO) << "failed to delete file: " << record_file_path; + } + } else { + LOG(INFO) << "file: " << record_file_path << "does not exist."; + } + } + + std::string record_file_path; + JSONFileDatabase test_db; + std::vector lowered_funcs; + Target target = common::DefaultHostTarget(); +}; + +TEST_F(TestJSONFileDatabase, Serialize) { + ir::IRSchedule ir_sch = MakeIRSchedule(lowered_funcs, "test"); + auto fused = ir_sch.Fuse("B", {0, 1}); + VLOG(3) << "after Fuse, Expr: " << fused; + + TuningRecord record1("test", SearchState(std::move(ir_sch), 2.0), 1.0); + std::string str = test_db.RecordToJSON(record1); + VLOG(3) << "RecordToJSON: " << str; + // Because the serialization of protobuf does not guarantee the order, we give all possible results. 
+ std::string case1 = + "{\"taskKey\":\"test\",\"executionCost\":1,\"predictedCost\":2,\"trace\":{\"steps\":[{\"type\":\"FuseWithName\"," + "\"outputs\":[\"e0\"],\"attrs\":[{\"name\":\"loops_index\",\"dtype\":\"INTS\",\"ints\":[0,1]},{\"name\":\"block_" + "name\",\"dtype\":\"STRING\",\"s\":\"B\"}]}]}}"; + std::string case2 = + "{\"taskKey\":\"test\",\"executionCost\":1,\"predictedCost\":2,\"trace\":{\"steps\":[{\"type\":\"FuseWithName\"," + "\"outputs\":[\"e0\"],\"attrs\":[{\"name\":\"block_name\",\"dtype\":\"STRING\",\"s\":\"B\"},{\"name\":\"loops_" + "index\",\"dtype\":\"INTS\",\"ints\":[0,1]}]}]}}"; + EXPECT_EQ(true, str == case1 || str == case2); +} + +TEST_F(TestJSONFileDatabase, SaveLoad) { + ir::IRSchedule ir_sch1 = MakeIRSchedule(lowered_funcs, "k1"); + auto fused1 = ir_sch1.Fuse("B", {0, 1}); + ir::IRSchedule ir_sch2 = MakeIRSchedule(lowered_funcs, "k2"); + + test_db.AddRecord(TuningRecord("k1", SearchState(std::move(ir_sch1), 1.5), 1.0)); + test_db.AddRecord(TuningRecord("k2", SearchState(std::move(ir_sch2), 3.5), 3.0)); + + std::vector strs = ReadLinesFromFile(record_file_path); + ASSERT_EQ(strs.size(), 2); + // Because the serialization of protobuf does not guarantee the order, we give all possible results. + std::string case1 = + "{\"taskKey\":\"k1\",\"executionCost\":1,\"predictedCost\":1.5,\"trace\":{\"steps\":[{\"type\":\"FuseWithName\"," + "\"outputs\":[\"e0\"],\"attrs\":[{\"name\":\"loops_index\",\"dtype\":\"INTS\",\"ints\":[0,1]},{\"name\":\"block_" + "name\",\"dtype\":\"STRING\",\"s\":\"B\"}]}]}}"; + std::string case2 = + "{\"taskKey\":\"k1\",\"executionCost\":1,\"predictedCost\":1.5,\"trace\":{\"steps\":[{\"type\":\"FuseWithName\"," + "\"outputs\":[\"e0\"],\"attrs\":[{\"name\":\"block_name\",\"dtype\":\"STRING\",\"s\":\"B\"},{\"name\":\"loops_" + "index\",\"dtype\":\"INTS\",\"ints\":[0,1]}]}]}}"; + EXPECT_EQ(true, strs[0] == case1 || strs[0] == case2); + EXPECT_EQ(strs[1], "{\"taskKey\":\"k2\",\"executionCost\":3,\"predictedCost\":3.5,\"trace\":{}}"); +} + +TEST_F(TestJSONFileDatabase, Basic) { + test_db.AddRecord(TuningRecord("k1", SearchState(MakeIRSchedule(lowered_funcs, "k1"), 1.0), 1.0)); + test_db.AddRecord(TuningRecord("k2", SearchState(MakeIRSchedule(lowered_funcs, "k2"), 1.0), 2.0)); + test_db.AddRecord(TuningRecord("k2", SearchState(MakeIRSchedule(lowered_funcs, "k2"), 1.0), 3.0)); + test_db.AddRecord(TuningRecord("k3", SearchState(MakeIRSchedule(lowered_funcs, "k3"), 8.0), 3.0)); + test_db.AddRecord(TuningRecord("k3", SearchState(MakeIRSchedule(lowered_funcs, "k3"), 7.0), 4.0)); + test_db.AddRecord(TuningRecord("k3", SearchState(MakeIRSchedule(lowered_funcs, "k3"), 6.0), 5.0)); + test_db.AddRecord(TuningRecord("k4", SearchState(MakeIRSchedule(lowered_funcs, "k4"), 1.0), 4.0)); + + ASSERT_EQ(test_db.Size(), 6); + auto records = test_db.LookUp("k3"); + // check the max number of stored candidates will + // be restricted to capacity_per_task + ASSERT_EQ(test_db.Count("k3"), 2); + ASSERT_EQ(records.size(), 2); + EXPECT_EQ(records[0].execution_cost, 3.0); + EXPECT_EQ(records[1].execution_cost, 4.0); +} + +TEST_F(TestJSONFileDatabase, GetTopK) { + test_db.AddRecord(TuningRecord("k1", SearchState(MakeIRSchedule(lowered_funcs, "k1"), 1.0), 1.0)); + test_db.AddRecord(TuningRecord("k2", SearchState(MakeIRSchedule(lowered_funcs, "k2"), 1.0), 2.0)); + test_db.AddRecord(TuningRecord("k2", SearchState(MakeIRSchedule(lowered_funcs, "k2"), 1.0), 3.0)); + test_db.AddRecord(TuningRecord("k3", SearchState(MakeIRSchedule(lowered_funcs, "k3"), 1.0), 3.0)); + 
test_db.AddRecord(TuningRecord("k3", SearchState(MakeIRSchedule(lowered_funcs, "k3"), 1.0), 4.0)); + test_db.AddRecord(TuningRecord("k3", SearchState(MakeIRSchedule(lowered_funcs, "k3"), 1.0), 5.0)); + test_db.AddRecord(TuningRecord("k4", SearchState(MakeIRSchedule(lowered_funcs, "k4"), 2.0), 4.0)); + test_db.AddRecord(TuningRecord("k4", SearchState(MakeIRSchedule(lowered_funcs, "k4"), 1.2), 2.0)); + test_db.AddRecord(TuningRecord("k4", SearchState(MakeIRSchedule(lowered_funcs, "k4"), 1.0), 3.0)); + + auto records = test_db.GetTopK("k4", 3); + ASSERT_EQ(records.size(), 2); + EXPECT_FLOAT_EQ(records[0].predicted_cost, 1.2); + EXPECT_FLOAT_EQ(records[1].predicted_cost, 1.0); +} + +TEST_F(TestJSONFileDatabase, Reload) { + ir::IRSchedule ir_sch = MakeIRSchedule(lowered_funcs, "k1"); + auto fused = ir_sch.Fuse("B", {0, 1}); + test_db.AddRecord(TuningRecord("k1", SearchState(std::move(ir_sch), 1.0), 1.0)); + test_db.AddRecord(TuningRecord("k2", SearchState(MakeIRSchedule(lowered_funcs, "k2"), 1.0), 2.0)); + auto records = test_db.LookUp("k1"); + ASSERT_EQ(records.size(), 1); + + JSONFileDatabase new_db(2, record_file_path, false); + ASSERT_EQ(new_db.Size(), 2); + auto loaded_records = new_db.LookUp("k1"); + ASSERT_EQ(records.size(), loaded_records.size()); + EXPECT_EQ(records[0].task_key, loaded_records[0].task_key); + EXPECT_EQ(records[0].execution_cost, loaded_records[0].execution_cost); + EXPECT_EQ(records[0].predicted_cost, loaded_records[0].predicted_cost); + + // check the equality of trace info between original TuningRecord and the loaded TuningRecord + const auto& lhs_trace = records[0].trace; + const auto& rhs_trace = loaded_records[0].trace; + google::protobuf::util::MessageDifferencer dif; + static const google::protobuf::Descriptor* descriptor = cinn::ir::proto::ScheduleDesc_Step::descriptor(); + dif.TreatAsSet(descriptor->FindFieldByName("attrs")); + EXPECT_TRUE(dif.Compare(lhs_trace, rhs_trace)); + + // check the equality of module expr between original TuningRecord + // and the loaded TuningRecord by replaying with tracing ScheduleDesc + ir::IRSchedule lhs_sch = MakeIRSchedule(lowered_funcs, "k1"); + ir::IRSchedule rhs_sch = MakeIRSchedule(lowered_funcs, "k1"); + ir::ScheduleDesc::ReplayWithProto(lhs_trace, &lhs_sch); + ir::ScheduleDesc::ReplayWithProto(rhs_trace, &rhs_sch); + auto lhs_exprs = lhs_sch.GetModule().GetExprs(); + auto rhs_exprs = rhs_sch.GetModule().GetExprs(); + + ASSERT_EQ(lhs_exprs.size(), rhs_exprs.size()); + for (auto i = 0; i < lhs_exprs.size(); ++i) { + std::string lhs = utils::GetStreamCnt(lhs_exprs.at(i)); + std::string rhs = utils::GetStreamCnt(rhs_exprs.at(i)); + size_t remove_prefix_len = 28; + ASSERT_EQ(lhs.erase(0, remove_prefix_len), rhs.erase(0, remove_prefix_len)); + } +} + +} // namespace auto_schedule +} // namespace cinn diff --git a/paddle/cinn/auto_schedule/measure/CMakeLists.txt b/paddle/cinn/auto_schedule/measure/CMakeLists.txt new file mode 100644 index 0000000000000..ea2e822368df2 --- /dev/null +++ b/paddle/cinn/auto_schedule/measure/CMakeLists.txt @@ -0,0 +1,6 @@ +core_gather_headers() + +gather_srcs(cinnapi_src SRCS schedule_measurer.cc simple_builder.cc simple_runner.cc) + +cc_test(test_simple_runner SRCS simple_runner_test.cc DEPS cinncore) +cc_test(test_measurer SRCS measurer_test.cc DEPS cinncore) diff --git a/paddle/cinn/auto_schedule/measure/measure.h b/paddle/cinn/auto_schedule/measure/measure.h new file mode 100644 index 0000000000000..124aa474d9948 --- /dev/null +++ b/paddle/cinn/auto_schedule/measure/measure.h @@ -0,0 +1,79 @@ 
+// Copyright (c) 2022 CINN Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include +#include + +#include "cinn/auto_schedule/task/tune_task.h" +#include "cinn/hlir/framework/graph_compiler.h" +#include "cinn/hlir/framework/instruction.h" +#include "cinn/runtime/cinn_runtime.h" + +namespace cinn { +namespace auto_schedule { + +// The input to a measurer +struct MeasureInput { + // The task object related to this measurement. + const TuneTask* task; + // lowered Exprs to be measured + std::vector lowered_funcs; + // It is used to pass for some arguments that maybe + // specified value in advance. default is null + const std::map* execution_args = nullptr; +}; + +// The result of a measurement +struct MeasureResult { + // The time cost of execution in average of running + // with a specific repeated times. + double execution_cost = 0.0; // unit: us + // The time cost of the whole measurement process including + // building and running + double elapsed_time = 0.0; // unit: us + // used to return detail messages once an error occurred during measurement, + // empty if nothing goes wrong + std::string error_msg; +}; + +// The result of building with input schedule +struct BuildResult { + // The scope that owns detail compilation infos of parameters in the runtime program + const hlir::framework::Scope* compiled_scope; + // The executable program + std::unique_ptr runtime_program; +}; + +// This interface defines how to generate executable objects +// with input schedule. A builder should not contain stateful data +// related to any task so it can be called parallelly among multiple +// processes of task tuning. +class ScheduleBuilder { + public: + virtual BuildResult Build(const MeasureInput& input) = 0; +}; + +// This interface defines how to run the built result. Like above ScheduleBuilder, +// a runner shoule be implemented with not bound to a specific task. +class ScheduleRunner { + public: + virtual MeasureResult Run(const MeasureInput& input, const BuildResult& build_result) = 0; +}; + +} // namespace auto_schedule +} // namespace cinn diff --git a/paddle/cinn/auto_schedule/measure/measurer_test.cc b/paddle/cinn/auto_schedule/measure/measurer_test.cc new file mode 100644 index 0000000000000..5297cabad5296 --- /dev/null +++ b/paddle/cinn/auto_schedule/measure/measurer_test.cc @@ -0,0 +1,127 @@ +// Copyright (c) 2022 CINN Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
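The two interfaces above are intentionally small; implementing them is mostly a matter of filling in one method. Below is a hedged sketch of a do-nothing runner against the ScheduleRunner contract (the tests that follow use a similar trick to inject failures). It demonstrates the interface only, not real measurement.

// Hedged sketch: a trivial ScheduleRunner that satisfies the interface but
// performs no measurement. A real runner would execute
// build_result.runtime_program and time it.
class NoopRunner : public ScheduleRunner {
 public:
  MeasureResult Run(const MeasureInput& input, const BuildResult& build_result) override {
    MeasureResult result;
    result.execution_cost = 0.0;  // placeholder; nothing is actually run
    return result;
  }
};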
+ +#include + +#include + +#include "cinn/auto_schedule/measure/schedule_measurer.h" +#include "cinn/auto_schedule/measure/simple_builder.h" +#include "cinn/auto_schedule/measure/simple_runner.h" +#include "cinn/auto_schedule/task/task_creator.h" +#include "cinn/common/target.h" +#include "cinn/frontend/net_builder.h" +#include "cinn/frontend/optimize.h" +#include "cinn/frontend/syntax.h" +#include "cinn/hlir/framework/graph_compiler.h" +#include "cinn/runtime/flags.h" + +DECLARE_bool(cinn_ir_schedule); + +namespace cinn { +namespace auto_schedule { + +using ::cinn::hlir::framework::BuildScope; +using ::cinn::hlir::framework::Graph; +using ::cinn::hlir::framework::GraphCompiler; + +frontend::Program CreateAddReluProgram() { + constexpr int M = 32; + constexpr int N = 24; + frontend::NetBuilder builder("test"); + + auto a = builder.CreateInput(Float(32), {M, N}, "A"); + auto b = builder.CreateInput(Float(32), {M, N}, "B"); + auto c = builder.Add(a, b); + auto d = builder.Relu(c); + return builder.Build(); +} + +class TestMeasurer : public ::testing::Test { + public: + std::unique_ptr graph_compiler; + std::vector tasks; + std::vector inputs; + + void SetUp() override { + FLAGS_cinn_ir_schedule = true; +#ifdef CINN_WITH_CUDA + Target target = common::DefaultNVGPUTarget(); +#else + Target target = common::DefaultHostTarget(); +#endif + std::unordered_set fetch_ids; + auto program = CreateAddReluProgram(); + auto graph = cinn::frontend::Optimize(&program, fetch_ids, target); + auto scope = BuildScope(target, graph); + graph_compiler = std::make_unique(target, scope, graph); + TaskCreator task_creator; + tasks = task_creator.CreateTuneTaskOpLevel(graph.get()); + const auto& dtype_dict = graph->GetAttrs>("inferdtype"); + const auto& shape_dict = graph->GetAttrs>("infershape"); + + auto op_lowerer = std::make_unique(dtype_dict, shape_dict, target); + inputs.reserve(tasks.size()); + for (int i = 0; i < tasks.size(); ++i) { + auto* task = &tasks[i]; + task->Initialize(shape_dict, dtype_dict, op_lowerer.get()); + MeasureInput input; + input.task = task; + input.lowered_funcs = task->lowered_funcs; + inputs.emplace_back(input); + } + } +}; + +class ThrowExceptionBuilder : public ScheduleBuilder { + struct Exception : public std::exception { + const char* what() const throw() { return "BuildError"; } + }; + BuildResult Build(const MeasureInput& input) override { throw Exception(); } +}; + +class ThrowExceptionRunner : public ScheduleRunner { + struct Exception : public std::exception { + const char* what() const throw() { return "RunError"; } + }; + MeasureResult Run(const MeasureInput& input, const BuildResult& build_result) override { throw Exception(); } +}; + +TEST_F(TestMeasurer, Basic) { + auto builder = std::make_unique(graph_compiler.get()); + auto runner = std::make_unique(1); + auto measurer = std::make_unique(builder.get(), runner.get()); + std::vector results = measurer->Measure(inputs); + ASSERT_EQ(inputs.size(), results.size()); +} + +TEST_F(TestMeasurer, CatchException) { + auto builder = std::make_unique(graph_compiler.get()); + auto runner = std::make_unique(1); + auto throw_builder = std::make_unique(); + auto throw_runner = std::make_unique(); + auto measurer_with_build_error = std::make_unique(throw_builder.get(), runner.get(), 2); + std::vector results = measurer_with_build_error->Measure(inputs); + ASSERT_EQ(inputs.size(), results.size()); + EXPECT_EQ(results[0].error_msg, "Build failed, error: BuildError\n"); + + // TODO(CtfGo): test parallel build after we support thread-safe 
compilation
+  auto measurer_with_run_error = std::make_unique<ScheduleMeasurer>(builder.get(), throw_runner.get(), 1);
+  results = measurer_with_run_error->Measure(inputs);
+  ASSERT_EQ(inputs.size(), results.size());
+  EXPECT_EQ(results[0].error_msg, "Run failed, error: RunError\n");
+}
+
+}  // namespace auto_schedule
+}  // namespace cinn
diff --git a/paddle/cinn/auto_schedule/measure/schedule_measurer.cc b/paddle/cinn/auto_schedule/measure/schedule_measurer.cc
new file mode 100644
index 0000000000000..3662d831d3eb2
--- /dev/null
+++ b/paddle/cinn/auto_schedule/measure/schedule_measurer.cc
@@ -0,0 +1,77 @@
+// Copyright (c) 2022 CINN Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "cinn/auto_schedule/measure/schedule_measurer.h"
+
+#include <chrono>
+
+#include "cinn/utils/multi_threading.h"
+
+namespace cinn {
+namespace auto_schedule {
+
+ScheduleMeasurer::ScheduleMeasurer(ScheduleBuilder* builder, ScheduleRunner* runner, int num_threads)
+    : builder_(builder), runner_(runner), num_threads_(num_threads) {}
+
+std::vector<MeasureResult> ScheduleMeasurer::Measure(const std::vector<MeasureInput>& inputs) {
+  if (inputs.empty()) {
+    LOG(WARNING) << "inputs is empty";
+    return {};
+  }
+  std::vector<BuildResult> build_results(inputs.size());
+  std::vector<MeasureResult> results(inputs.size());
+
+  // define how to build a candidate with the specified index
+  auto build_fn = [builder = builder_, &inputs, &build_results, &results](int index) {
+    VLOG(6) << "Build candidate index: " << index;
+    auto m_start = std::chrono::steady_clock::now();
+    try {
+      build_results[index] = builder->Build(inputs[index]);
+    } catch (std::exception& e) {
+      results[index].error_msg = utils::StringFormat("Build failed, error: %s\n", e.what());
+    }
+    auto time_span =
+        std::chrono::duration_cast<std::chrono::microseconds>(std::chrono::steady_clock::now() - m_start);
+    results[index].elapsed_time += static_cast<double>(time_span.count());
+  };
+
+  // define how to run a candidate with the specified index
+  auto run_fn = [runner = runner_, &inputs, &build_results, &results](int index) {
+    VLOG(6) << "Run candidate index: " << index;
+    auto m_start = std::chrono::steady_clock::now();
+    try {
+      // if an error occurred during building, skip running
+      if (results[index].error_msg.empty()) {
+        results[index] = runner->Run(inputs[index], build_results[index]);
+      }
+    } catch (std::exception& e) {
+      results[index].error_msg = utils::StringFormat("Run failed, error: %s\n", e.what());
+    }
+    auto time_span =
+        std::chrono::duration_cast<std::chrono::microseconds>(std::chrono::steady_clock::now() - m_start);
+    results[index].elapsed_time += static_cast<double>(time_span.count());
+  };
+
+  // measure a candidate by calling build and run successively
+  auto measure_fn = [&build_fn, &run_fn](int index) {
+    build_fn(index);
+    run_fn(index);
+  };
+  // num_threads_ defaults to 1, in which case all measurements are performed sequentially in place.
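+  // SequenceDispatcher hands out the indices [0, inputs.size()) to the worker
+  // threads; exceptions are caught inside build_fn/run_fn themselves, so a
+  // failing candidate records its error_msg without aborting the whole batch.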
+  utils::parallel_run(measure_fn, utils::SequenceDispatcher(0, inputs.size()), num_threads_);
+
+  VLOG(4) << "Measure " << inputs.size() << " candidates";
+  return results;
+}
+
+}  // namespace auto_schedule
+}  // namespace cinn
diff --git a/paddle/cinn/auto_schedule/measure/schedule_measurer.h b/paddle/cinn/auto_schedule/measure/schedule_measurer.h
new file mode 100644
index 0000000000000..bf093b2c199a5
--- /dev/null
+++ b/paddle/cinn/auto_schedule/measure/schedule_measurer.h
@@ -0,0 +1,44 @@
+// Copyright (c) 2022 CINN Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <vector>
+
+#include "cinn/auto_schedule/measure/measure.h"
+
+namespace cinn {
+namespace auto_schedule {
+
+// Entrance of schedule measurement; it consists of two processes:
+// building the input schedules and running the generated code.
+class ScheduleMeasurer {
+ public:
+  ScheduleMeasurer(ScheduleBuilder* builder, ScheduleRunner* runner, int num_threads = 1);
+
+  // Measure a batch of inputs and return all results at once.
+  std::vector<MeasureResult> Measure(const std::vector<MeasureInput>& inputs);
+
+ private:
+  // The handle to the ScheduleBuilder implementation; not owned.
+  ScheduleBuilder* builder_;
+  // The handle to the ScheduleRunner implementation; not owned.
+  ScheduleRunner* runner_;
+  // The number of threads used to perform measurement;
+  // a value greater than 1 enables parallel measurement.
+  const int num_threads_;
+};
+
+}  // namespace auto_schedule
+}  // namespace cinn
diff --git a/paddle/cinn/auto_schedule/measure/simple_builder.cc b/paddle/cinn/auto_schedule/measure/simple_builder.cc
new file mode 100644
index 0000000000000..5921d1b63b026
--- /dev/null
+++ b/paddle/cinn/auto_schedule/measure/simple_builder.cc
@@ -0,0 +1,41 @@
+// Copyright (c) 2022 CINN Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
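+// Illustrative usage sketch (not part of this file; `compiler` and `input`
+// are hypothetical, standing for a configured GraphCompiler and a prepared
+// MeasureInput):
+//   SimpleBuilder builder(&compiler);
+//   BuildResult built = builder.Build(input);
+//   // built.runtime_program is then handed to a ScheduleRunner.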
+
+#include "cinn/auto_schedule/measure/simple_builder.h"
+
+namespace cinn {
+namespace auto_schedule {
+
+using hlir::framework::GraphCompiler;
+
+SimpleBuilder::SimpleBuilder(hlir::framework::GraphCompiler* graph_compiler) : graph_compiler_(graph_compiler) {}
+
+BuildResult SimpleBuilder::Build(const MeasureInput& input) {
+  CHECK_NE(graph_compiler_, static_cast<GraphCompiler*>(nullptr)) << "empty handle to GraphCompiler";
+  GraphCompiler::CompileOptions compile_options;
+  compile_options.groups.emplace_back(input.task->subgraph);
+  compile_options.lowered_funcs.emplace_back(input.lowered_funcs);
+  compile_options.remove_unused_variables = false;
+  VLOG(5) << "call GraphCompiler to Build with Graph::Group size=" << compile_options.groups.size()
+          << ", lowered_funcs group size=" << compile_options.lowered_funcs.size();
+  GraphCompiler::CompilationResult compiled_result = graph_compiler_->Build(compile_options);
+
+  BuildResult build_result;
+  build_result.compiled_scope = graph_compiler_->GetScope().get();
+  build_result.runtime_program = std::move(compiled_result.runtime_program);
+  return build_result;
+}
+
+}  // namespace auto_schedule
+}  // namespace cinn
diff --git a/paddle/cinn/auto_schedule/measure/simple_builder.h b/paddle/cinn/auto_schedule/measure/simple_builder.h
new file mode 100644
index 0000000000000..8757a3e322207
--- /dev/null
+++ b/paddle/cinn/auto_schedule/measure/simple_builder.h
@@ -0,0 +1,37 @@
+// Copyright (c) 2022 CINN Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "cinn/auto_schedule/measure/measure.h"
+#include "cinn/hlir/framework/graph_compiler.h"
+
+namespace cinn {
+namespace auto_schedule {
+
+// This class utilizes the GraphCompiler bound to the graph to build
+// the input schedule into executable objects.
+class SimpleBuilder : public ScheduleBuilder {
+ public:
+  SimpleBuilder(hlir::framework::GraphCompiler* graph_compiler);
+
+  // Build and pack the result
+  BuildResult Build(const MeasureInput& input) override;
+
+ private:
+  hlir::framework::GraphCompiler* graph_compiler_;
+};
+
+}  // namespace auto_schedule
+}  // namespace cinn
diff --git a/paddle/cinn/auto_schedule/measure/simple_runner.cc b/paddle/cinn/auto_schedule/measure/simple_runner.cc
new file mode 100644
index 0000000000000..54660ccc93c56
--- /dev/null
+++ b/paddle/cinn/auto_schedule/measure/simple_runner.cc
@@ -0,0 +1,227 @@
+// Copyright (c) 2022 CINN Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
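+// Illustrative usage sketch (hypothetical driver code, continuing the
+// SimpleBuilder example above):
+//   SimpleRunner runner(/*repeat_times=*/3);
+//   MeasureResult res = runner.Run(input, built);
+//   // res.execution_cost holds the averaged per-run time in microseconds.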
+
+#include "cinn/auto_schedule/measure/simple_runner.h"
+
+#include <algorithm>
+#include <chrono>
+#include <cstdlib>
+#include <cstring>
+#include <limits>
+#include <random>
+
+#include "cinn/common/target.h"
+#include "cinn/hlir/framework/buffer.h"
+#include "cinn/hlir/framework/scope.h"
+#include "cinn/hlir/framework/tensor.h"
+
+namespace cinn {
+namespace auto_schedule {
+
+using hlir::framework::Buffer;
+using hlir::framework::Shape;
+using hlir::framework::Tensor;
+
+// Parameters that need to be initialized to 0.
+// Key is the op name; value holds the indices of the op's input parameters
+// that must be zero-initialized.
+static const std::unordered_map<std::string, std::vector<int>> kInitWithZeroParams = {
+    {"lookup_table", {1}},
+    {"gather", {1}},
+    {"gather_nd", {1}},
+    {"scatter_assign", {2}},
+    {"scatter_add", {2}},
+};
+
+// Generate random values and write them to the output memory address
+static void PopulateRandomValue(const common::Type& type, const int numel, void* raw_ptr) {
+  std::random_device seed;
+  std::default_random_engine engine(seed());
+
+  if (type == common::Bool()) {
+    auto* fmt_ptr = reinterpret_cast<bool*>(raw_ptr);
+    std::bernoulli_distribution dist(0.5);
+    std::generate_n(fmt_ptr, numel, [&engine, &dist]() { return dist(engine); });
+  } else if (type == common::I32()) {
+    auto* fmt_ptr = reinterpret_cast<int*>(raw_ptr);
+    std::uniform_int_distribution<int> dist(std::numeric_limits<int>::min(), std::numeric_limits<int>::max());
+    std::generate_n(fmt_ptr, numel, [&engine, &dist]() { return dist(engine); });
+  } else if (type == common::I64()) {
+    auto* fmt_ptr = reinterpret_cast<int64_t*>(raw_ptr);
+    std::uniform_int_distribution<int64_t> dist(std::numeric_limits<int64_t>::min(),
+                                                std::numeric_limits<int64_t>::max());
+    std::generate_n(fmt_ptr, numel, [&engine, &dist]() { return dist(engine); });
+  } else if (type == common::F32()) {
+    auto* fmt_ptr = reinterpret_cast<float*>(raw_ptr);
+    std::uniform_real_distribution<float> dist(std::numeric_limits<float>::min(), std::numeric_limits<float>::max());
+    std::generate_n(fmt_ptr, numel, [&engine, &dist]() { return dist(engine); });
+  } else {
+    CHECK_EQ(type.bytes(), 8) << "Unsupported type: " << type << ", type.bytes = " << type.bytes();
+    auto* fmt_ptr = reinterpret_cast<int64_t*>(raw_ptr);
+    std::uniform_int_distribution<int64_t> dist(std::numeric_limits<int64_t>::min(),
+                                                std::numeric_limits<int64_t>::max());
+    std::generate_n(fmt_ptr, numel, [&engine, &dist]() { return dist(engine); });
+  }
+}
+
+// Initialize a tensor with 0 if init_with_zero == true, otherwise initialize it with random values.
+static void InitTensorData(Tensor tensor, const common::Target& target, bool init_with_zero) {
+  int mem_size = tensor->shape().numel() * tensor->type().bytes();
+  auto* tensor_data = tensor->mutable_data(target, tensor->type());
+#ifdef CINN_WITH_CUDA
+  if (target == common::DefaultNVGPUTarget()) {
+    if (init_with_zero) {
+      cudaMemset(tensor_data, 0, mem_size);
+    } else {
+      void* tmp_buffer = malloc(mem_size);
+      PopulateRandomValue(tensor->type(), tensor->shape().numel(), tmp_buffer);
+      cudaMemcpy(tensor_data, tmp_buffer, mem_size, cudaMemcpyHostToDevice);
+      free(tmp_buffer);
+    }
+  }
+#endif
+  if (target == common::DefaultHostTarget()) {
+    if (init_with_zero) {
+      memset(tensor_data, 0, mem_size);
+    } else {
+      PopulateRandomValue(tensor->type(), tensor->shape().numel(), tensor_data);
+    }
+  }
+}
+
+// Find all parameter names in the task corresponding to the MeasureInput
+// that need to be initialized to 0 when measuring.
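+// For example, input #1 of "gather" holds indices into input #0, so filling
+// it with random bytes could address out-of-bounds memory; zero-initializing
+// it keeps every access valid during measurement.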
+static std::unordered_set<std::string> ParamsNeedInitWithZero(const MeasureInput& input) {
+  std::unordered_set<std::string> res;
+  std::vector<hlir::framework::Node*> nodes = input.task->subgraph->CollectNodes();
+  for (auto* node : nodes) {
+    if (kInitWithZeroParams.count(node->op()->name) != 0) {
+      std::vector<int> param_idxs = kInitWithZeroParams.at(node->op()->name);
+      const auto& inlinks = node->inlinks_in_order();
+      for (int param_idx : param_idxs) {
+        CHECK_GT(inlinks.size(), param_idx);
+        auto& edge = inlinks.at(param_idx);
+        std::string param_name = edge->source()->as<hlir::framework::NodeData>()->id();
+        VLOG(6) << "param needs to be initialized with 0: " << param_name;
+        res.insert(param_name);
+      }
+    }
+  }
+
+  return res;
+}
+
+SimpleRunner::SimpleRunner(int repeat_times) : repeat_times_(repeat_times) {
+  CHECK_GT(repeat_times_, 0) << "repeat_times must be greater than 0";
+}
+
+// Prepare the execution arguments of all instructions to run; an argument
+// may be taken from the input of the measurement, or be a newly allocated
+// buffer filled with random values.
+std::map<std::string, cinn_pod_value_t> SimpleRunner::PrepareArgs(const MeasureInput& input,
+                                                                  const BuildResult& build_result,
+                                                                  hlir::framework::Scope* temp_scope) {
+  std::map<std::string, cinn_pod_value_t> result;
+
+  const auto& target = input.task->target;
+  const auto* input_args = input.execution_args;
+  const auto* compiled_scope = build_result.compiled_scope;
+  const auto& instructions = build_result.runtime_program->GetRunInstructions();
+
+  std::unordered_set<std::string> params_need_init_with_zero = ParamsNeedInitWithZero(input);
+
+  auto fill_arg_fn = [&](const std::string& param) {
+    VLOG(6) << "Filling argument:" << param;
+    // the argument is duplicated and has already been prepared.
+    if (result.count(param)) {
+      return;
+    }
+
+    // if the input of the measurement specifies this argument,
+    // use it first.
+    if (input_args && input_args->count(param)) {
+      VLOG(6) << "Argument[" << param << "] use input value";
+      result.emplace(param, input_args->at(param));
+      return;
+    }
+
+    if (temp_scope->FindVar(param)) {
+      auto temp_tensor = temp_scope->GetTensor(param);
+      result.emplace(param, temp_tensor->buffer());
+      return;
+    }
+
+    // allocate a new buffer for this argument and store it in
+    // the temporary scope to be released at the proper time.
+    auto compiled_tensor = compiled_scope->GetTensor(param);
+    temp_scope->Var<Tensor>(param);
+    auto temp_tensor = temp_scope->GetTensor(param);
+    temp_tensor->Resize(compiled_tensor->shape());
+    temp_tensor->set_type(compiled_tensor->type());
+    temp_tensor->mutable_data(target, compiled_tensor->type());
+    InitTensorData(temp_tensor, target, params_need_init_with_zero.count(param) != 0);
+
+    result.emplace(param, temp_tensor->buffer());
+  };
+
+  for (auto&& instr : instructions) {
+    for (auto&& args : instr->GetInArgs()) {
+      std::for_each(args.begin(), args.end(), fill_arg_fn);
+    }
+
+    for (auto&& args : instr->GetOutArgs()) {
+      std::for_each(args.begin(), args.end(), fill_arg_fn);
+    }
+  }
+  return result;
+}
+
+MeasureResult SimpleRunner::Run(const MeasureInput& input, const BuildResult& build_result) {
+  MeasureResult result;
+  auto t_start = std::chrono::steady_clock::now();
+  // prepare execution arguments
+  VLOG(4) << "SimpleRunner prepare execution arguments";
+  hlir::framework::Scope temp_scope;  // used to store temporarily allocated data
+  auto execution_args = PrepareArgs(input, build_result, &temp_scope);
+
+  // Execute each instruction repeatedly and take the average as the cost.
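+  // Each instruction is launched repeat_times_ times back-to-back and its
+  // averaged wall time is accumulated; on CUDA targets the device must be
+  // synchronized before reading the clock, since kernel launches are
+  // asynchronous with respect to the host.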
+ result.execution_cost = 0; + const auto& instructions = build_result.runtime_program->GetRunInstructions(); + for (auto ct = 0; ct < instructions.size(); ++ct) { + auto&& instr = instructions.at(ct); + VLOG(5) << "Start running instruction-" << ct; + auto run_start = std::chrono::steady_clock::now(); + for (int i = 0; i < repeat_times_; ++i) { + instr->Run(&execution_args); + } +#ifdef CINN_WITH_CUDA + if (instr->target_ == common::DefaultNVGPUTarget()) { + CUDA_CALL(cudaDeviceSynchronize()); + } +#endif + auto time_span = + std::chrono::duration_cast(std::chrono::steady_clock::now() - run_start); + auto cost_avg = static_cast(time_span.count()) / repeat_times_; + result.execution_cost += cost_avg; + } + + auto time_span = std::chrono::duration_cast(std::chrono::steady_clock::now() - t_start); + result.elapsed_time = static_cast(time_span.count()); + + VLOG(4) << "A measurement done:repeat_times[" << repeat_times_ << "]total_elapsed_time[" << result.elapsed_time + << "]us,execution_cost[" << result.execution_cost << "]us"; + return result; +} + +} // namespace auto_schedule +} // namespace cinn diff --git a/paddle/cinn/auto_schedule/measure/simple_runner.h b/paddle/cinn/auto_schedule/measure/simple_runner.h new file mode 100644 index 0000000000000..48b316a0d7c06 --- /dev/null +++ b/paddle/cinn/auto_schedule/measure/simple_runner.h @@ -0,0 +1,43 @@ +// Copyright (c) 2022 CINN Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "cinn/auto_schedule/measure/measure.h" +#include "cinn/hlir/framework/instruction.h" + +namespace cinn { +namespace auto_schedule { + +// This class utilize the built instructions to execute the generated +// kernels and count the elapsed time as the measurement of performance +class SimpleRunner : public ScheduleRunner { + public: + SimpleRunner(int repeat_times); + + MeasureResult Run(const MeasureInput& input, const BuildResult& build_result) override; + + private: + std::map PrepareArgs(const MeasureInput& input, + const BuildResult& build_result, + hlir::framework::Scope* temp_scope); + + private: + // The repeat times of running instructions, + // this runner will return the average time + const int repeat_times_; +}; + +} // namespace auto_schedule +} // namespace cinn diff --git a/paddle/cinn/auto_schedule/measure/simple_runner_test.cc b/paddle/cinn/auto_schedule/measure/simple_runner_test.cc new file mode 100644 index 0000000000000..b20faa6734a52 --- /dev/null +++ b/paddle/cinn/auto_schedule/measure/simple_runner_test.cc @@ -0,0 +1,139 @@ + +// Copyright (c) 2022 CINN Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "cinn/auto_schedule/measure/simple_runner.h" + +#include + +#include +#include + +#include "cinn/common/target.h" +#include "cinn/frontend/net_builder.h" +#include "cinn/frontend/optimize.h" +#include "cinn/frontend/syntax.h" +#include "cinn/hlir/framework/graph_compiler.h" + +namespace cinn { +namespace auto_schedule { + +using ::cinn::hlir::framework::BuildScope; +using ::cinn::hlir::framework::Graph; +using ::cinn::hlir::framework::GraphCompiler; +using ::cinn::hlir::framework::Instruction; +using ::cinn::hlir::framework::Scope; + +class TestSimpleRunner : public ::testing::Test { + public: +#ifdef CINN_WITH_CUDA + Target target = common::DefaultNVGPUTarget(); +#else + Target target = common::DefaultHostTarget(); +#endif + std::shared_ptr graph; + std::shared_ptr compiled_scope; + std::unique_ptr graph_compiler; + std::unique_ptr task; + + MeasureInput input; + BuildResult build_result; + + static frontend::Program CreateAddReluProgram(); + void SetUp() override { + std::unordered_set fetch_ids; + auto program = CreateAddReluProgram(); + auto graph = cinn::frontend::Optimize(&program, fetch_ids, target); + compiled_scope = BuildScope(target, graph); + graph_compiler = std::make_unique(target, compiled_scope, graph); + auto runtime_program = graph_compiler->Build(); + const auto& instructions = runtime_program->GetRunInstructions(); + ASSERT_EQ(1, instructions.size()); + + build_result.compiled_scope = compiled_scope.get(); + build_result.runtime_program = std::move(runtime_program); + + task = std::make_unique(); +#ifdef CINN_WITH_CUDA + task->target = common::DefaultNVGPUTarget(); +#else + task->target = common::DefaultHostTarget(); +#endif + task->subgraph = graph->fusion_groups.front(); + input.task = task.get(); + } +}; + +frontend::Program TestSimpleRunner::CreateAddReluProgram() { + constexpr int M = 32; + constexpr int N = 24; + frontend::NetBuilder builder("test"); + + auto a = builder.CreateInput(Float(32), {M, N}, "A"); + auto b = builder.CreateInput(Float(32), {M, N}, "B"); + auto c = builder.Add(a, b); + auto d = builder.Relu(c); + return builder.Build(); +} + +TEST_F(TestSimpleRunner, MeasureWithRandomValue) { + auto runner = std::make_unique(1); + ASSERT_NO_THROW(runner->Run(input, build_result)); +} + +TEST_F(TestSimpleRunner, MeasureWithSpecifiedArgs) { + auto ta = compiled_scope->GetTensor("A"); + ta->mutable_data(target); + auto tb = compiled_scope->GetTensor("B"); + tb->mutable_data(target); + std::map preset_args; + preset_args.emplace("A", ta->buffer()); + preset_args.emplace("B", tb->buffer()); + + auto runner = std::make_unique(1); + // specific several execution args + input.execution_args = &preset_args; + ASSERT_NO_THROW(runner->Run(input, build_result)); +} + +TEST_F(TestSimpleRunner, TimeMeasured) { + // set up a BuildResult object with one instruction of the `sleep` function + void (*sleep_fn)(void*, int32_t) = [](void*, int32_t) -> void { + std::this_thread::sleep_for(std::chrono::microseconds(100)); + }; + BuildResult build_result; + build_result.compiled_scope = nullptr; + std::vector> instructions; + 
instructions.emplace_back( + new Instruction(common::DefaultHostTarget(), nullptr, {}, {"empty_placeholder"}, "sleep_fn")); + instructions.back()->SetLoweredFunc(reinterpret_cast(sleep_fn)); + instructions.back()->Finalize(); + build_result.runtime_program.reset(new hlir::framework::Program(nullptr, std::move(instructions))); + + // to skip the condition check of params in Instruction::PreparePodArgs + std::map preset_args; + preset_args.emplace("empty_placeholder", cinn_pod_value_t()); + input.execution_args = &preset_args; + + auto runner = std::make_unique(2); + MeasureResult measure_result = runner->Run(input, build_result); + // because the kernel function will sleep 100 us, + // the cost time of execution and span in total must + // be greater than 100us and 200us (repeatedly running 2 times) respectively. + ASSERT_GE(measure_result.execution_cost, 100); + ASSERT_GE(measure_result.elapsed_time, 200); +} + +} // namespace auto_schedule +} // namespace cinn diff --git a/paddle/cinn/auto_schedule/post_schedule_rule/CMakeLists.txt b/paddle/cinn/auto_schedule/post_schedule_rule/CMakeLists.txt new file mode 100644 index 0000000000000..eda51bbb7e568 --- /dev/null +++ b/paddle/cinn/auto_schedule/post_schedule_rule/CMakeLists.txt @@ -0,0 +1,9 @@ +core_gather_headers() + +gather_srcs(cinnapi_src SRCS + cooperative_process.cc + ) + +if (WITH_CUDA) + nv_test(test_cooperative_process SRCS cooperative_process_test.cc DEPS cinncore auto_gen_rule_test_helper test_program_builder) +endif() diff --git a/paddle/cinn/auto_schedule/post_schedule_rule/cooperative_process.cc b/paddle/cinn/auto_schedule/post_schedule_rule/cooperative_process.cc new file mode 100644 index 0000000000000..2b8c05e105f1d --- /dev/null +++ b/paddle/cinn/auto_schedule/post_schedule_rule/cooperative_process.cc @@ -0,0 +1,70 @@ +// Copyright (c) 2023 CINN Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
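+// Sketch of the rewrite this rule performs (illustrative IR fragment; the
+// loop names and extents are hypothetical, with num_threads = 16):
+//   before: serial for (ax0, 0, 32)   // block annotated with cooperative_process
+//   after:  serial for (ax0, 0, 2)
+//             thread_bind[threadIdx.x] for (ax0_0, 0, 16)
+//           __syncthreads()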
+
+#include "cinn/auto_schedule/post_schedule_rule/cooperative_process.h"
+
+#include "cinn/ir/ir.h"
+#include "cinn/ir/ir_printer.h"
+#include "cinn/ir/ir_schedule.h"
+#include "cinn/ir/schedule_desc.h"
+
+namespace cinn {
+namespace auto_schedule {
+
+int ExtractNumThreads(const ir::IRSchedule& ir_schedule, const std::string& bind_axis) {
+  const ir::ScheduleDesc& trace = ir_schedule.GetTraceDesc();
+  for (auto&& step : trace.Steps()) {
+    if (step.type == "Bind" && step.attrs.find("thread_axis") != step.attrs.end() &&
+        absl::get<std::string>(step.attrs.at("thread_axis")) == bind_axis) {
+      CHECK_EQ(step.inputs.at("loop").size(), 1);
+      return step.inputs.at("loop")[0].As<ir::For>()->extent.as_int32();
+    }
+  }
+  return 0;
+}
+
+std::vector<std::string> FindCandidates(const ir::ScheduleDesc& trace) {
+  std::vector<std::string> candidate_block_names;
+  for (auto&& step : trace.Steps()) {
+    if (step.type == "AnnotateIntAttr" &&
+        absl::get<std::string>(step.attrs.at("key")) == ir::attr::cooperative_process) {
+      candidate_block_names.push_back(
+          step.inputs.at("block")[0].As<ir::ScheduleBlockRealize>()->schedule_block.As<ir::ScheduleBlock>()->name);
+    }
+  }
+  return candidate_block_names;
+}
+
+bool CooperativeProcess::Apply(ir::IRSchedule* schedule) {
+  int num_threads = ExtractNumThreads(*schedule, "threadIdx.x");
+  const ir::ScheduleDesc& trace = schedule->GetTraceDesc();
+  std::vector<std::string> candidate_block_names = FindCandidates(trace);
+  for (auto&& candidate : candidate_block_names) {
+    auto loop = schedule->GetLoops(candidate).back();
+    if (loop.As<ir::For>()->extent.as_int32() <= num_threads) {
+      schedule->Bind(loop, "threadIdx.x");
+      loop = schedule->GetLoops(candidate).back();
+      schedule->SyncThreads(loop);
+    } else {
+      auto split_loops = schedule->Split(loop, {-1, num_threads});
+      schedule->Bind(split_loops.back(), "threadIdx.x");
+      schedule->SyncThreads(split_loops[0]);
+    }
+    auto block = schedule->GetBlock(candidate);
+    schedule->Unannotate(block, ir::attr::cooperative_process);
+  }
+  return true;
+}
+
+}  // namespace auto_schedule
+}  // namespace cinn
diff --git a/paddle/cinn/auto_schedule/post_schedule_rule/cooperative_process.h b/paddle/cinn/auto_schedule/post_schedule_rule/cooperative_process.h
new file mode 100644
index 0000000000000..9f106dfda0eb3
--- /dev/null
+++ b/paddle/cinn/auto_schedule/post_schedule_rule/cooperative_process.h
@@ -0,0 +1,34 @@
+// Copyright (c) 2023 CINN Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "cinn/auto_schedule/post_schedule_rule/post_schedule_rule.h"
+
+namespace cinn {
+namespace auto_schedule {
+
+/*
+ * @brief Rewrite the cooperative_process annotation to actually bind the loop on threadIdx.
+ * This rule is used for cooperative data handling by multiple threads within the same block.
+ */ +class CooperativeProcess : public PostScheduleRule { + public: + CooperativeProcess() = default; + + bool Apply(ir::IRSchedule* schedule) final; +}; + +} // namespace auto_schedule +} // namespace cinn diff --git a/paddle/cinn/auto_schedule/post_schedule_rule/cooperative_process_test.cc b/paddle/cinn/auto_schedule/post_schedule_rule/cooperative_process_test.cc new file mode 100644 index 0000000000000..c10005a910969 --- /dev/null +++ b/paddle/cinn/auto_schedule/post_schedule_rule/cooperative_process_test.cc @@ -0,0 +1,199 @@ +// Copyright (c) 2022 CINN Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "cinn/auto_schedule/post_schedule_rule/cooperative_process.h" + +#include + +#include "cinn/auto_schedule/search_space/auto_gen_rule/test_helper.h" +#include "cinn/ir/ir_printer.h" +#include "tests/program_builder.h" + +namespace cinn { +namespace auto_schedule { + +class TestCooperativeProcess : public TestAutoGenRuleBase { + public: + int fixed_rand_seed = 1; + std::vector default_input_names; + std::vector default_output_names; +}; + +TEST_F(TestCooperativeProcess, Matmul) { + default_input_names = {"X", "Y"}; + default_output_names = {"temp_matmul_out"}; + std::vector X_shape = {32, 32}; + std::vector Y_shape = {32, 32}; + std::vector out_shape = {32, 32}; + + int num_blocks_y = 2; + int num_blocks_x = 2; + int num_threads_y = 8; + int num_threads_x = 2; + int steps_k = 8; + + Initialize(common::DefaultNVGPUTarget()); + frontend::Program matmul_op = tests::OpBuilder("matmul").Build({{"X", X_shape}, {"Y", Y_shape}}); + ir::IRSchedule ir_schedule = MakeIRSchedule(matmul_op, fixed_rand_seed); + + // split loops + std::vector loops = ir_schedule.GetLoops("temp_matmul_out"); + std::vector k_loops = ir_schedule.Split(loops[2], {steps_k, -1}); + std::vector j_loops = ir_schedule.Split(loops[1], {num_blocks_x, num_threads_x, -1}); + std::vector i_loops = ir_schedule.Split(loops[0], {num_blocks_y, num_threads_y, -1}); + // reorder to "SSRRS": i0, j0, i1, j1, k0, k1, j2, i2 + loops = ir_schedule.GetLoops("temp_matmul_out"); + ir_schedule.Reorder({loops[0], loops[3], loops[1], loops[4], loops[6], loops[7], loops[2], loops[5]}); + // fuse and bind + loops = ir_schedule.GetLoops("temp_matmul_out"); + ir::Expr i1_j1_fused = ir_schedule.Fuse({loops[2], loops[3]}); + ir::Expr i0_j0_fused = ir_schedule.Fuse({loops[0], loops[1]}); + loops = ir_schedule.GetLoops("temp_matmul_out"); + ir_schedule.Bind(loops[1], "threadIdx.x"); + ir_schedule.Bind(loops[0], "blockIdx.x"); + // cache read + ir::Expr out_block = ir_schedule.GetBlock("temp_matmul_out"); + ir::Expr X_cache_block = ir_schedule.CacheRead(out_block, 1, "shared"); + std::string X_cache_block_name = + X_cache_block.As()->schedule_block.As()->name; + loops = ir_schedule.GetLoops("temp_matmul_out"); + ir_schedule.ComputeAt(X_cache_block, loops[2]); + std::vector X_cache_loops = ir_schedule.GetLoops(X_cache_block_name); + ir_schedule.Fuse({X_cache_loops[3], X_cache_loops[4]}); + 
ir_schedule.Annotate(ir_schedule.GetBlock(X_cache_block_name), ir::attr::cooperative_process, 0); + + out_block = ir_schedule.GetBlock("temp_matmul_out"); + ir::Expr Y_cache_block = ir_schedule.CacheRead(out_block, 2, "shared"); + std::string Y_cache_block_name = + Y_cache_block.As()->schedule_block.As()->name; + loops = ir_schedule.GetLoops("temp_matmul_out"); + ir_schedule.ComputeAt(Y_cache_block, loops[2]); + std::vector Y_cache_loops = ir_schedule.GetLoops(Y_cache_block_name); + ir_schedule.Fuse({Y_cache_loops[3], Y_cache_loops[4]}); + ir_schedule.Annotate(ir_schedule.GetBlock(Y_cache_block_name), ir::attr::cooperative_process, 0); + + // apply CooperativeProcess + CooperativeProcess cooperative_process; + cooperative_process.Apply(&ir_schedule); + + // check ir + auto ir = GetIR(ir_schedule); + VLOG(6) << "after CooperativeProcess, ir: \n" << ir; + std::string expected_ir = R"ROC(Expr 0 { +{ + ScheduleBlock(root) + { + { + serial for (i, 0, 2) + { + serial for (j, 0, 2) + { + serial for (i_0, 0, 8) + { + serial for (j_0, 0, 2) + { + serial for (i_1, 0, 2) + { + serial for (j_1, 0, 8) + { + ScheduleBlock(temp_matmul_out__reduce_init) + { + i0, i1 = axis.bind(((16 * i) + ((2 * i_0) + i_1)), ((16 * j) + ((8 * j_0) + j_1))) + { + temp_matmul_out__reduce_init[((16 * i) + ((2 * i_0) + i_1)), ((16 * j) + ((8 * j_0) + j_1))] = 0.00000000f + } + } + } + } + } + } + } + } + thread_bind[blockIdx.x] for (i_j_fused, 0, 4) + { + thread_bind[threadIdx.x] for (i_0_j_0_fused, 0, 16) + { + serial for (reduce_k_0, 0, 8) + { + serial for (ax0_0_ax1_0_fused, 0, 2) + { + thread_bind[threadIdx.x] for (ax0_0_ax1_0_fused_0, 0, 16) + { + ScheduleBlock(Y_reshape_shared_temp_buffer) + { + v0, v1 = axis.bind(((((16 * ax0_0_ax1_0_fused) + ax0_0_ax1_0_fused_0) / 8) + (4 * reduce_k_0)), ((((16 * ax0_0_ax1_0_fused) + ax0_0_ax1_0_fused_0) % 8) + ((8 * (i_0_j_0_fused % 2)) + (16 * (i_j_fused % 2))))) + attrs(compute_at_extra_var:ax0_0,ax1_0) + { + Y_reshape_shared_temp_buffer[v0, v1] = Y_reshape[v0, v1] + } + } + } + } + __syncthreads() + thread_bind[threadIdx.x] for (ax0_ax1_fused, 0, 8) + { + ScheduleBlock(X_reshape_shared_temp_buffer) + { + v0, v1 = axis.bind(((ax0_ax1_fused / 4) + ((2 * (i_0_j_0_fused / 2)) + (16 * (i_j_fused / 2)))), ((ax0_ax1_fused % 4) + (4 * reduce_k_0))) + attrs(compute_at_extra_var:ax0,ax1) + { + X_reshape_shared_temp_buffer[v0, v1] = X_reshape[v0, v1] + } + } + } + __syncthreads() + serial for (reduce_k_1, 0, 4) + { + serial for (i_1, 0, 2) + { + serial for (j_1, 0, 8) + { + ScheduleBlock(temp_matmul_out) + { + i0_0, i1_0, i2 = axis.bind(((2 * (i_0_j_0_fused / 2)) + ((16 * (i_j_fused / 2)) + i_1)), ((8 * (i_0_j_0_fused % 2)) + ((16 * (i_j_fused % 2)) + j_1)), ((4 * reduce_k_0) + reduce_k_1)) + { + temp_matmul_out[((2 * (i_0_j_0_fused / 2)) + ((16 * (i_j_fused / 2)) + i_1)), ((8 * (i_0_j_0_fused % 2)) + ((16 * (i_j_fused % 2)) + j_1))] = (temp_matmul_out[((2 * (i_0_j_0_fused / 2)) + ((16 * (i_j_fused / 2)) + i_1)), ((8 * (i_0_j_0_fused % 2)) + ((16 * (i_j_fused % 2)) + j_1))] + (X_reshape_shared_temp_buffer[((2 * (i_0_j_0_fused / 2)) + ((16 * (i_j_fused / 2)) + i_1)), ((4 * reduce_k_0) + reduce_k_1)] * Y_reshape_shared_temp_buffer[((4 * reduce_k_0) + reduce_k_1), ((8 * (i_0_j_0_fused % 2)) + ((16 * (i_j_fused % 2)) + j_1))])) + } + } + } + } + } + } + } + } + } + } +} +} // end Expr 0 +)ROC"; + ASSERT_EQ(ir, expected_ir); + + // build ir::Module and debug source code + auto ir_module = BuildIRModule(ir_schedule); + auto source_code = GenSourceCode(ir_module); + VLOG(6) << "scheduled source 
code:\n" << source_code; + + // execute and check precision + CheckResult( + GenExecutableKernel(ir_module), + GenExecutableKernel(BuildIRModule(MakeIRSchedule(matmul_op, fixed_rand_seed, /* apply_manual_schedule*/ true))), + default_input_names, + default_output_names, + {X_shape, Y_shape}, + {out_shape}, + target_); +} + +} // namespace auto_schedule +} // namespace cinn diff --git a/paddle/cinn/auto_schedule/post_schedule_rule/post_schedule_rule.h b/paddle/cinn/auto_schedule/post_schedule_rule/post_schedule_rule.h new file mode 100644 index 0000000000000..136d4fc18f297 --- /dev/null +++ b/paddle/cinn/auto_schedule/post_schedule_rule/post_schedule_rule.h @@ -0,0 +1,38 @@ +// Copyright (c) 2023 CINN Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include "cinn/ir/ir_schedule.h" + +namespace cinn { +namespace auto_schedule { + +/** + * Base class for rules of post process, + * used to process schedules that rely on mutate results. + */ +class PostScheduleRule { + public: + PostScheduleRule() = default; + + /** + * @brief Apply the post schedule rule to the given SearchState. + * @param state The given SearchState for post schedule. + * @return True if apply successfully. + */ + virtual bool Apply(ir::IRSchedule* schedule) = 0; +}; + +} // namespace auto_schedule +} // namespace cinn diff --git a/paddle/cinn/auto_schedule/search_space/CMakeLists.txt b/paddle/cinn/auto_schedule/search_space/CMakeLists.txt new file mode 100644 index 0000000000000..44d73649efaec --- /dev/null +++ b/paddle/cinn/auto_schedule/search_space/CMakeLists.txt @@ -0,0 +1,15 @@ +add_subdirectory(auto_gen_rule) + +core_gather_headers() + +gather_srcs(cinnapi_src SRCS + search_space.cc + search_state.cc + block_sampler.cc + rule_sampler.cc + ) + +cc_test(test_search_space SRCS search_space_test.cc DEPS cinncore) +cc_test(test_search_state SRCS search_state_test.cc DEPS cinncore) +cc_test(test_block_sampler SRCS block_sampler_test.cc DEPS cinncore) +cc_test(test_rule_sampler SRCS rule_sampler_test.cc DEPS cinncore) diff --git a/paddle/cinn/auto_schedule/search_space/auto_gen_rule/CMakeLists.txt b/paddle/cinn/auto_schedule/search_space/auto_gen_rule/CMakeLists.txt new file mode 100644 index 0000000000000..dcb81c71baefd --- /dev/null +++ b/paddle/cinn/auto_schedule/search_space/auto_gen_rule/CMakeLists.txt @@ -0,0 +1,24 @@ +core_gather_headers() + +gather_srcs(cinnapi_src SRCS + auto_gen_rule.cc + auto_inline.cc + auto_unroll.cc + multi_level_tiling.cc + skip_rule.cc + auto_bind.cc +) + +if (WITH_TESTING) + cc_library(auto_gen_rule_test_helper SRCS test_helper.cc DEPS glog gtest cinncore) +endif() + +if (WITH_CUDA) + nv_test(test_mix_rules SRCS mix_rules_test.cc DEPS cinncore auto_gen_rule_test_helper test_program_builder) + nv_test(test_auto_bind SRCS auto_bind_test.cc DEPS cinncore auto_gen_rule_test_helper test_program_builder) + nv_test(test_multi_level_tiling SRCS multi_level_tiling_test.cc DEPS cinncore auto_gen_rule_test_helper test_program_builder) +endif() + 
+#cc_test(test_auto_inline SRCS auto_inline_test.cc DEPS cinncore auto_gen_rule_test_helper) +cc_test(test_skip_rule SRCS skip_rule_test.cc DEPS cinncore) +cc_test(test_auto_unroll SRCS auto_unroll_test.cc DEPS cinncore) diff --git a/paddle/cinn/auto_schedule/search_space/auto_gen_rule/auto_bind.cc b/paddle/cinn/auto_schedule/search_space/auto_gen_rule/auto_bind.cc new file mode 100644 index 0000000000000..0a49d8c269645 --- /dev/null +++ b/paddle/cinn/auto_schedule/search_space/auto_gen_rule/auto_bind.cc @@ -0,0 +1,163 @@ +// Copyright (c) 2023 CINN Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "cinn/auto_schedule/search_space/auto_gen_rule/auto_bind.h" + +#include + +#include "cinn/ir/collect_ir_nodes.h" +#include "cinn/ir/ir_printer.h" +#include "cinn/ir/ir_schedule.h" +#include "cinn/optim/ir_copy.h" + +namespace cinn { +namespace auto_schedule { + +static constexpr uint32_t kMaxBlocks = 256; +// check whether the input ir::For is a spatial loop +bool IsSpatialLoop(const ir::For* for_node) { + if (for_node->for_type() != ir::ForType::Serial) return false; + const auto& loop_var = for_node->loop_var; + // collect cases where the loop_var used in one of reduce axis in underneath ScheduleBlock + auto used_for_reduce_axis = ir::CollectIRNodesWithoutTensor(for_node->body, [&loop_var](const Expr* x) { + const auto* block_realize = x->As(); + if (!block_realize) return false; + + const auto* schedule_block = block_realize->schedule_block.As(); + CHECK(schedule_block) << "schedule_block field is not a ScheduleBlock"; + CHECK_EQ(block_realize->iter_values.size(), schedule_block->iter_vars.size()); + for (int i = 0; i < block_realize->iter_values.size(); ++i) { + const ir::Var& iter_var = schedule_block->iter_vars[i]; + const ir::Expr& binding = block_realize->iter_values[i]; + if (iter_var->is_reduce_axis || iter_var->name.substr(0, 6) == "reduce") { + auto used_exprs = ir::CollectIRNodesWithoutTensor(binding, [&loop_var](const Expr* x) { + const ir::_Var_* var = x->As(); + if (var && (x->same_as(loop_var) || var->name == loop_var->name)) { + return true; + } + return false; + }); + if (!used_exprs.empty()) return true; + } + } + + return false; + }); + + if (!used_for_reduce_axis.empty()) return false; + return true; +} + +// count the number of loops that can be binded from the input for_node to bottom +int CountLoopCanBinded(const ir::For* for_node) { + int cnt = 0; + while (for_node) { + if (for_node->is_binded()) break; // has binded + if (!IsSpatialLoop(for_node)) break; // only spatial loops to be binded + + cnt += 1; + + CHECK(for_node->body.defined() && for_node->body.As()) << "Body is not defined"; + const ir::Block* body = for_node->body.As(); + // terminate when body of this loop has more than one statement or the body is not a ir::For node + for_node = body->stmts.size() == 1 ? 
body->stmts[0].As() : nullptr; + } + return cnt; +} + +void BindGPUIndex(ir::IRSchedule* ir_schedule, + const std::string& block_name, + int num_loops_to_bind, + int max_blocks, + int max_threads_per_block) { + auto all_loops = ir_schedule->GetLoops(block_name); + CHECK_LE(num_loops_to_bind, all_loops.size()) << "The number of loops to be bind is greater than size of all_loops"; + // check whether it is the case that threadIdx has been binded but blockIdx not, + // the threadIdx can only be binded in the first loop after num_loops_to_bind loops + // because we has excluded other cases in CountLoopCanBinded + bool gpu_thread_has_binded = + num_loops_to_bind < all_loops.size() && all_loops[num_loops_to_bind].As()->is_gpu_thread_binded(); + Expr fused_loop = ir_schedule->Fuse({all_loops.begin(), all_loops.begin() + num_loops_to_bind}); + int32_t extent = fused_loop.As()->extent.as_int32(); + if (gpu_thread_has_binded) { + ir_schedule->Bind(fused_loop, "blockIdx.x"); + return; + } + + if (extent <= max_threads_per_block) { + ir_schedule->Bind(fused_loop, "threadIdx.x"); + return; + } + + if (extent <= max_blocks * max_threads_per_block) { + auto splits = ir_schedule->Split(fused_loop, {-1, max_threads_per_block}); + CHECK_EQ(splits.size(), 2); + ir_schedule->Bind(splits[0], "blockIdx.x"); + ir_schedule->Bind(splits[1], "threadIdx.x"); + } else { + auto splits = ir_schedule->Split(fused_loop, {-1, max_blocks, max_threads_per_block}); + CHECK_EQ(splits.size(), 3); + ir_schedule->Reorder({splits[1], splits[2], splits[0]}); + all_loops = ir_schedule->GetLoops(block_name); + ir_schedule->Bind(all_loops[0], "blockIdx.x"); + ir_schedule->Bind(all_loops[1], "threadIdx.x"); + } +} + +RuleApplyType AutoBind::Init(ir::IRSchedule* ir_schedule) { + ir_schedule_ = ir_schedule; + + for (auto&& block_realize : ir_schedule->GetAllBlocks()) { + auto all_loops = ir_schedule->GetLoops(block_realize); + if (CountLoopCanBinded(all_loops[0].As()) > 0) { + applicable_schedule_blocks_.emplace_back(block_realize); + } + } + num_applicable_ = applicable_schedule_blocks_.size(); + VLOG(6) << "Collect applicable_schedule_blocks_:" << num_applicable_; + return num_applicable_ > 0 ? RuleApplyType::kApplyAndPruneOtherRules : RuleApplyType::kCannotApply; +} + +void AutoBind::Apply(int index) { + CHECK_LT(index, applicable_schedule_blocks_.size()) << "invalid apply index:" << index; + auto applied_block = applicable_schedule_blocks_.at(index); + auto all_loops = ir_schedule_->GetLoops(applied_block); + BindGPUIndex(ir_schedule_, + applied_block.As()->schedule_block.As()->name, + CountLoopCanBinded(all_loops[0].As()), + kMaxBlocks, + target_->max_num_threads()); + return; +} + +RuleApplyType AutoBind::AnalyseApplyType(SearchState state, const std::string& block_name) const { + Expr block_expr = state->ir_schedule.GetBlock(block_name); + auto all_loops = state->ir_schedule.GetLoops(block_expr); + return CountLoopCanBinded(all_loops[0].As()) > 0 ? 
RuleApplyType::kApplyAndPruneOtherRules + : RuleApplyType::kCannotApply; +} + +std::vector AutoBind::ApplyOnBlock(SearchState state, const std::string& block_name) { + SearchState new_state = state.Copy(); + auto all_loops = state->ir_schedule.GetLoops(block_name); + BindGPUIndex(&new_state->ir_schedule, + block_name, + CountLoopCanBinded(all_loops[0].As()), + kMaxBlocks, + target_->max_num_threads()); + return {new_state}; +} + +} // namespace auto_schedule +} // namespace cinn diff --git a/paddle/cinn/auto_schedule/search_space/auto_gen_rule/auto_bind.h b/paddle/cinn/auto_schedule/search_space/auto_gen_rule/auto_bind.h new file mode 100644 index 0000000000000..b93f633b230e3 --- /dev/null +++ b/paddle/cinn/auto_schedule/search_space/auto_gen_rule/auto_bind.h @@ -0,0 +1,48 @@ +// Copyright (c) 2023 CINN Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include + +#include "cinn/auto_schedule/search_space/auto_gen_rule/auto_gen_rule.h" +#include "cinn/ir/ir.h" +#include "cinn/ir/ir_schedule.h" + +namespace cinn { +namespace auto_schedule { + +// Auto bind GPU index(BlockIdx, ThreadIdx) to the loops around the block +class AutoBind : public AutoGenRule { + public: + AutoBind(const common::Target& target) : AutoGenRule(target) {} + ~AutoBind() = default; + + RuleApplyType Init(ir::IRSchedule* init_schedule) override; + + void Apply(int index) override; + + std::string GetRuleName() const override { return "AutoBind"; } + + RuleApplyType AnalyseApplyType(SearchState state, const std::string& block_name) const override; + + std::vector ApplyOnBlock(SearchState state, const std::string& block_name) override; + + private: + std::vector applicable_schedule_blocks_; +}; + +} // namespace auto_schedule +} // namespace cinn diff --git a/paddle/cinn/auto_schedule/search_space/auto_gen_rule/auto_bind_test.cc b/paddle/cinn/auto_schedule/search_space/auto_gen_rule/auto_bind_test.cc new file mode 100644 index 0000000000000..9ffbe0a3f4a3a --- /dev/null +++ b/paddle/cinn/auto_schedule/search_space/auto_gen_rule/auto_bind_test.cc @@ -0,0 +1,118 @@ +// Copyright (c) 2023 CINN Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "cinn/auto_schedule/search_space/auto_gen_rule/auto_bind.h" + +#include +#include + +#include +#include +#include + +#include "cinn/auto_schedule/search_space/auto_gen_rule/test_helper.h" +#include "cinn/ir/ir_printer.h" +#include "tests/program_builder.h" + +namespace cinn { +namespace auto_schedule { + +static constexpr uint32_t kMaxBlocks = 256; +static constexpr uint32_t kMaxThreadsPerBlock = 1024; + +class TestAutoBind : public TestAutoGenRuleBase { + public: + std::vector default_input_names = {"X", "Y"}; + std::vector default_output_names = {"temp_matmul_out"}; + + void TestApplyOnElementWiseAdd(const std::vector& shape, const std::string& block_name) { + Initialize(common::DefaultNVGPUTarget()); + auto test_program = tests::OpBuilder("elementwise_add").Build({{"X", shape}, {"Y", shape}}); + // construct input parameter + ir::IRSchedule ir_schedule = MakeIRSchedule(test_program); + SearchState state(ir_schedule, 0, {}); + std::vector func_bodys = ir_schedule.GetModule().GetExprs(); + ASSERT_EQ(func_bodys.size(), 1UL); + VLOG(6) << "Original Expr:\n" << func_bodys[0]; + + // apply + AutoBind auto_bind(target_); + ASSERT_EQ(auto_bind.AnalyseApplyType(state, block_name), RuleApplyType::kApplyAndPruneOtherRules); + auto result = auto_bind.ApplyOnBlock(state, block_name)[0]; + std::vector exprs = result->ir_schedule.GetModule().GetExprs(); + EXPECT_EQ(exprs.size(), 1UL); + VLOG(6) << "AutoBind applied Expr: " << exprs[0]; + + // check bind result + auto all_loops = result->ir_schedule.GetLoops(block_name); + int total_num = std::accumulate(shape.begin(), shape.end(), 1, std::multiplies()); + if (total_num <= kMaxThreadsPerBlock) { + ASSERT_EQ(all_loops.size(), 1); + EXPECT_EQ(all_loops[0].As()->extent.as_int32(), total_num); + EXPECT_TRUE(all_loops[0].As()->is_gpu_thread_binded()); + } else if (total_num <= kMaxBlocks * kMaxThreadsPerBlock) { + ASSERT_EQ(all_loops.size(), 2); + EXPECT_EQ(all_loops[0].As()->extent.as_int32(), + static_cast(std::ceil(double(total_num) / kMaxThreadsPerBlock))); + EXPECT_TRUE(all_loops[0].As()->is_gpu_block_binded()); + EXPECT_EQ(all_loops[1].As()->extent.as_int32(), kMaxThreadsPerBlock); + EXPECT_TRUE(all_loops[1].As()->is_gpu_thread_binded()); + } else { + ASSERT_EQ(all_loops.size(), 3); + EXPECT_EQ(all_loops[0].As()->extent.as_int32(), kMaxBlocks); + EXPECT_TRUE(all_loops[0].As()->is_gpu_block_binded()); + EXPECT_EQ(all_loops[1].As()->extent.as_int32(), kMaxThreadsPerBlock); + EXPECT_TRUE(all_loops[1].As()->is_gpu_thread_binded()); + EXPECT_EQ(all_loops[2].As()->extent.as_int32(), + static_cast(std::ceil(double(total_num) / (kMaxBlocks * kMaxThreadsPerBlock)))); + EXPECT_FALSE(all_loops[2].As()->is_binded()); + } + + // build and run + auto ir_module = BuildIRModule(result->ir_schedule); + auto source_code = GenSourceCode(ir_module); + VLOG(6) << "Optimized source code:\n" << source_code; + auto manual_ir_module = BuildIRModule(MakeIRSchedule(test_program, /* apply_manual_schedule*/ true)); + VLOG(6) << "Manual-schedule compiled source code:\n" << GenSourceCode(manual_ir_module); + CheckResult(GenExecutableKernel(ir_module), + GenExecutableKernel(manual_ir_module), + default_input_names, + {block_name}, + {shape, shape}, + {shape}, + target_); + } +}; + +TEST_F(TestAutoBind, AnalyseApplyType) { + Initialize(common::DefaultNVGPUTarget()); + ir::IRSchedule ir_schedule = MakeIRSchedule(tests::OpBuilder("matmul").Build({{"X", {32, 64}}, {"Y", {64, 32}}})); + SearchState state(ir_schedule, 0, {}); + AutoBind auto_bind(target_); + const std::string& 
applied_block_name = default_output_names.back(); + // outer two loops of initial Expr are spatial loops, so it can be applied + EXPECT_EQ(auto_bind.AnalyseApplyType(state, applied_block_name), RuleApplyType::kApplyAndPruneOtherRules); + state->ir_schedule.Fuse(applied_block_name, {0, 1}); + state->ir_schedule.Bind(state->ir_schedule.GetLoops(applied_block_name)[0], "threadIdx.x"); + // after fuse and bind, there is no loops to be binded. + EXPECT_EQ(auto_bind.AnalyseApplyType(state, applied_block_name), RuleApplyType::kCannotApply); +} + +TEST_F(TestAutoBind, ApplyOnBlock) { + TestApplyOnElementWiseAdd({64, 128}, "var_1"); + TestApplyOnElementWiseAdd({57, 133, 125}, "var_1"); +} + +} // namespace auto_schedule +} // namespace cinn diff --git a/paddle/cinn/auto_schedule/search_space/auto_gen_rule/auto_gen_rule.cc b/paddle/cinn/auto_schedule/search_space/auto_gen_rule/auto_gen_rule.cc new file mode 100644 index 0000000000000..fb6eaa797b4c1 --- /dev/null +++ b/paddle/cinn/auto_schedule/search_space/auto_gen_rule/auto_gen_rule.cc @@ -0,0 +1,41 @@ +// Copyright (c) 2022 CINN Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "cinn/auto_schedule/search_space/auto_gen_rule/auto_gen_rule.h" + +#include + +#include + +#include "cinn/common/target.h" +#include "cinn/ir/ir_schedule.h" + +namespace cinn { +namespace auto_schedule { + +AutoGenRule::AutoGenRule(const common::Target& target) : target_(&target) {} + +int AutoGenRule::NumberApplicable() const { + CHECK_GE(num_applicable_, 0) << "Call " << GetRuleName() << "::NumberApplicable() without initialization."; + return num_applicable_; +} + +void AutoGenRule::ApplyRandomly() { + CHECK_GT(num_applicable_, 0) << "Call " << GetRuleName() << "::ApplyRandomly() with NumberApplicable() == 0"; + int index = rand() % num_applicable_; + return Apply(index); +} + +} // namespace auto_schedule +} // namespace cinn diff --git a/paddle/cinn/auto_schedule/search_space/auto_gen_rule/auto_gen_rule.h b/paddle/cinn/auto_schedule/search_space/auto_gen_rule/auto_gen_rule.h new file mode 100644 index 0000000000000..2a4ed201ad709 --- /dev/null +++ b/paddle/cinn/auto_schedule/search_space/auto_gen_rule/auto_gen_rule.h @@ -0,0 +1,84 @@ +// Copyright (c) 2022 CINN Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
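+// Typical driver flow over this interface (illustrative; `rule` stands for
+// any concrete subclass such as AutoBind or AutoInline):
+//   if (rule->Init(&ir_schedule) != RuleApplyType::kCannotApply) {
+//     rule->ApplyRandomly();  // or rule->Apply(i), 0 <= i < rule->NumberApplicable()
+//   }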
+
+#pragma once
+
+#include <string>
+#include <vector>
+
+#include "cinn/auto_schedule/search_space/search_state.h"
+#include "cinn/common/target.h"
+#include "cinn/ir/ir_schedule.h"
+
+namespace cinn {
+namespace auto_schedule {
+
+/**
+ * Enum class representing how this rule can be applied to a ModuleExpr.
+ */
+enum class RuleApplyType : int {
+  // This rule cannot be applied to the ModuleExpr.
+  kCannotApply = 0,
+  // This rule can be applied to the ModuleExpr,
+  // and the original ModuleExpr will be retained for branching with other rules.
+  kApply = 1,
+  // This rule can be applied, but the original ModuleExpr will be deleted,
+  // so branches with other rules applied on the original ModuleExpr will be pruned.
+  kApplyAndPruneOtherRules = 2,
+};
+
+/**
+ * Base class for rules that auto-generate schedules (similar to Ansor's sketch generation).
+ */
+class AutoGenRule {
+ public:
+  AutoGenRule(const common::Target& target);
+  ~AutoGenRule() = default;
+
+  // Initializes the AutoGenRule; it must be called before further actions.
+  // Returns kCannotApply if the rule cannot be applied to the mod_expr.
+  virtual RuleApplyType Init(ir::IRSchedule* ir_schedule) = 0;
+
+  // A CINN IRSchedule can contain many ScheduleBlock(s) and Loop(s), so an
+  // auto-gen rule may be applicable to several schedule blocks. This method
+  // returns the number of ScheduleBlocks that this rule can be applied to.
+  virtual int NumberApplicable() const;
+
+  // Applies the rule on the ir::ModuleExpr for a randomly chosen schedule block.
+  virtual void ApplyRandomly();
+
+  // Applies the rule on the ir::ModuleExpr for the schedule block specified by an index
+  // between 0 (inclusive) and NumberApplicable() (exclusive).
+  virtual void Apply(int index) = 0;
+
+  // Returns the name of the rule, used for debug.
+  virtual std::string GetRuleName() const = 0;
+
+  // Analyzes whether the rule can be applied to the block determined by a specific SearchState and block name.
+  virtual RuleApplyType AnalyseApplyType(SearchState state, const std::string& block_name) const = 0;
+
+  // Applies the rule to the block determined by a specific SearchState and block name.
+  virtual std::vector<SearchState> ApplyOnBlock(SearchState state, const std::string& block_name) = 0;
+
+ protected:
+  // number of ScheduleBlocks that this rule can be applied to
+  int num_applicable_ = -1;
+  // Target, not owned.
+  const common::Target* target_;
+  // IRSchedule, not owned.
+  ir::IRSchedule* ir_schedule_;
+};
+
+}  // namespace auto_schedule
+}  // namespace cinn
diff --git a/paddle/cinn/auto_schedule/search_space/auto_gen_rule/auto_inline.cc b/paddle/cinn/auto_schedule/search_space/auto_gen_rule/auto_inline.cc
new file mode 100644
index 0000000000000..5b53ee148173c
--- /dev/null
+++ b/paddle/cinn/auto_schedule/search_space/auto_gen_rule/auto_inline.cc
@@ -0,0 +1,214 @@
+// Copyright (c) 2022 CINN Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+ +#include "cinn/auto_schedule/search_space/auto_gen_rule/auto_inline.h" + +#include +#include +#include +#include +#include +#include + +#include "cinn/auto_schedule/analysis/analyze_ir.h" +#include "cinn/auto_schedule/search_space/auto_gen_rule/auto_gen_rule.h" +#include "cinn/common/target.h" +#include "cinn/ir/collect_ir_nodes.h" +#include "cinn/ir/ir.h" +#include "cinn/ir/ir_base.h" +#include "cinn/ir/ir_printer.h" +#include "cinn/ir/ir_schedule.h" +#include "cinn/optim/ir_copy.h" + +namespace cinn { +namespace auto_schedule { + +AutoInline::AutoInline(const common::Target& target, const std::unordered_set& no_inline_output_names) + : AutoGenRule(target), no_inline_output_names_(no_inline_output_names) {} + +bool AutoInline::CanInlineIntoConsumer(const Expr& sche_block_realize_expr, ir::IRSchedule* ir_sch) const { + const ir::ScheduleBlockRealize* sche_block_realize = sche_block_realize_expr.As(); + const ir::ScheduleBlock* sche_block = sche_block_realize->schedule_block.As(); + ir::Expr compute_body = sche_block->body; + ir::Expr root = ir_sch->GetRootBlock(sche_block_realize_expr); + + // Check the schedule block to be inlined is not a reduce tensor. + std::set find_store = + ir::CollectIRNodesWithoutTensor(compute_body, [&](const Expr* x) { return x->As(); }); + if (find_store.size() != 1UL) { + return false; + } + + ir::Expr tensor_expr = (*find_store.begin()).As()->tensor; + ir::Tensor tensor = tensor_expr.as_tensor_ref(); + if (tensor->is_reduce_tensor()) { + return false; + } + + // LoweredFunc output can be tensor name or tensor buffer name + if (no_inline_output_names_.find(tensor->name) != no_inline_output_names_.end() || + no_inline_output_names_.find(tensor->buffer->name) != no_inline_output_names_.end()) { + return false; + } + + // write_buffers.size() = 1 and read_buffers is empty, means const + // we can inline to consumer + if (sche_block->read_buffers.empty()) { + return true; + } + + // Check this schedule block is the only writer of the tensor. + find_store = ir::CollectIRNodesWithoutTensor(root, [&](const Expr* x) { + return x->As() && (x->As()->tensor).as_tensor_ref()->name == tensor->name; + }); + if (find_store.size() != 1UL) { + return false; + } + // Check there is no overlap between the buffers the schedule block reads and writes. 
+  std::set<ir::Expr> find_load = ir::CollectIRNodesWithoutTensor(
+      compute_body, [&](const Expr* x) { return x->As<ir::Load>() && x->As<ir::Load>()->tensor == tensor_expr; });
+  if (!find_load.empty()) {
+    return false;
+  }
+
+  ir::Expr store = *(find_store.begin());
+
+  ir::ComputeInliner inliner(store.As<ir::Store>()->tensor.as_tensor_ref(), store);
+  if (!inliner.BodyPatternAllowInline()) {
+    return false;
+  }
+
+  ir::LeafBlockRemovalPlan remove_plan(sche_block_realize_expr, &inliner.src_stmt, &inliner.tgt_stmt);
+  remove_plan(&root);
+  if (!inliner.src_stmt.defined() || !inliner.tgt_stmt.defined()) {
+    return false;
+  }
+
+  VLOG(6) << "Found store Expr " << store << ", which CanInlineIntoConsumer";
+  return true;
+}
+
+AutoInlineType AutoInline::AnalyzeInlineType(const Expr& sche_block_realize_expr, ir::IRSchedule* ir_sch) const {
+  const ir::ScheduleBlockRealize* sche_block_realize = sche_block_realize_expr.As<ir::ScheduleBlockRealize>();
+  const ir::ScheduleBlock* sche_block = sche_block_realize->schedule_block.As<ir::ScheduleBlock>();
+
+  // Inline only if the block has exactly 1 write buffer
+  if (sche_block->write_buffers.size() != 1) {
+    return AutoInlineType::kCannotInline;
+  }
+
+  std::unordered_set<ir::IrNodeTy> no_inline_node_types = {ir::IrNodeTy::IfThenElse};
+  if (ContainsNodeType(sche_block->body, no_inline_node_types)) {
+    return AutoInlineType::kCannotInline;
+  }
+
+  // InlineIntoConsumer in situations other than the above
+  if (CanInlineIntoConsumer(sche_block_realize_expr, ir_sch)) {
+    return AutoInlineType::kInlineIntoConsumer;
+  }
+
+  // TODO(zhhsplendid): We don't have ReverseComputeInline in IRSchedule now,
+  // so we only do kInlineIntoConsumer here. Add CanInlineIntoProducer
+  // once ReverseComputeInline is ready.
+  return AutoInlineType::kCannotInline;
+}
+
+RuleApplyType AutoInline::Init(ir::IRSchedule* ir_schedule) {
+  ir_schedule_ = ir_schedule;
+  all_block_realizes_ = ir_schedule_->GetAllBlocks();
+  apply_indices_and_type_.clear();
+  num_applicable_ = 0;
+
+  for (size_t i = 0; i < all_block_realizes_.size(); ++i) {
+    ir::ScheduleBlockRealize* sche_block_realize = all_block_realizes_[i].As<ir::ScheduleBlockRealize>();
+    AnalyzeScheduleBlockReadWriteBuffer(sche_block_realize->schedule_block.As<ir::ScheduleBlock>());
+    AutoInlineType type = AnalyzeInlineType(all_block_realizes_[i], ir_schedule_);
+    if (type != AutoInlineType::kCannotInline) {
+      ++num_applicable_;
+      apply_indices_and_type_.push_back({i, type});
+    }
+  }
+
+  return num_applicable_ > 0 ? RuleApplyType::kApplyAndPruneOtherRules : RuleApplyType::kCannotApply;
+}
+
+void AutoInline::Apply(int index) {
+  CHECK(ir_schedule_ != nullptr) << "Run AutoInline::Apply without Init";
+  CHECK(num_applicable_ > 0 && apply_indices_and_type_.size() == num_applicable_)
+      << "AutoInline::Apply pre-condition isn't met";
+  CHECK(index >= 0 && num_applicable_ > index)
+      << "Invalid index for AutoInline::Apply, the index must satisfy 0 <= index < NumberApplicable(), "
+      << "currently index = " << index << ", NumberApplicable() = " << num_applicable_;
+
+  int apply_index = apply_indices_and_type_[index].first;
+  Apply(ir_schedule_, all_block_realizes_[apply_index]);
+  return;
+}
+
+std::string AutoInline::GetRuleName() const { return "AutoInline"; }
+
+RuleApplyType AutoInline::AnalyseApplyType(SearchState state, const std::string& block_name) const {
+  Expr block_expr = state->ir_schedule.GetBlock(block_name);
+  auto* block_realize = block_expr.As<ir::ScheduleBlockRealize>();
+  CHECK(block_realize) << "stmt is not a ScheduleBlockRealize:" << block_expr;
+
+  AnalyzeScheduleBlockReadWriteBuffer(block_realize->schedule_block.As<ir::ScheduleBlock>());
+  AutoInlineType type = AnalyzeInlineType(block_expr, &state->ir_schedule);
+
+  return type == AutoInlineType::kCannotInline ? RuleApplyType::kCannotApply : RuleApplyType::kApplyAndPruneOtherRules;
+}
+
+std::vector<SearchState> AutoInline::ApplyOnBlock(SearchState state, const std::string& block_name) {
+  SearchState new_state = state.Copy();
+  Expr block_expr = new_state->ir_schedule.GetBlock(block_name);
+  Apply(&new_state->ir_schedule, block_expr);
+
+  return {new_state};
+}
+
+void AutoInline::Apply(ir::IRSchedule* ir_schedule, ir::Expr& block_expr) {
+  auto* block_realize = block_expr.As<ir::ScheduleBlockRealize>();
+  CHECK(block_realize) << "stmt is not a ScheduleBlockRealize:" << block_expr;
+
+  AnalyzeScheduleBlockReadWriteBuffer(block_realize->schedule_block.As<ir::ScheduleBlock>());
+  AutoInlineType type = AnalyzeInlineType(block_expr, ir_schedule);
+
+  if (type == AutoInlineType::kInlineIntoConsumer) {
+    VLOG(6) << "Apply ComputeInline on " << block_expr;
+    ir_schedule->ComputeInline(block_expr);
+    VLOG(6) << "After ComputeInline: " << block_expr;
+
+  } else if (type == AutoInlineType::kInlineIntoProducer) {
+    // TODO(zhhsplendid): We don't have ReverseComputeInline in IRSchedule now,
+    // so this branch is unreachable for the moment. Add CanInlineIntoProducer
+    // once ReverseComputeInline is ready.
+
+    // ir_schedule->ReverseComputeInline(all_block_realizes_[apply_index]);
+  }
+
+  // Make sure re-applying AutoInline does not error out.
+  // AutoInline changes the read and write buffers of schedule blocks,
+  // so we need to re-analyze them.
+  all_block_realizes_ = ir_schedule->GetAllBlocks();
+  for (size_t i = 0; i < all_block_realizes_.size(); ++i) {
+    ir::ScheduleBlockRealize* sche_block_realize = all_block_realizes_[i].As<ir::ScheduleBlockRealize>();
+    ir::ScheduleBlock* sche_block = sche_block_realize->schedule_block.As<ir::ScheduleBlock>();
+    sche_block->read_buffers = {};
+    sche_block->write_buffers = {};
+    AnalyzeScheduleBlockReadWriteBuffer(sche_block);
+  }
+}
+
+}  // namespace auto_schedule
+}  // namespace cinn
diff --git a/paddle/cinn/auto_schedule/search_space/auto_gen_rule/auto_inline.h b/paddle/cinn/auto_schedule/search_space/auto_gen_rule/auto_inline.h
new file mode 100644
index 0000000000000..982092e717c33
--- /dev/null
+++ b/paddle/cinn/auto_schedule/search_space/auto_gen_rule/auto_inline.h
@@ -0,0 +1,71 @@
+// Copyright (c) 2022 CINN Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <memory>
+#include <string>
+#include <unordered_set>
+#include <utility>
+#include <vector>
+
+#include "cinn/auto_schedule/search_space/auto_gen_rule/auto_gen_rule.h"
+#include "cinn/common/target.h"
+#include "cinn/ir/ir_schedule.h"
+
+namespace cinn {
+namespace auto_schedule {
+
+/**
+ * The result types of the AutoInline analysis.
+ */
+enum class AutoInlineType : int {
+  // The block cannot be inlined
+  kCannotInline = 0,
+  // Inline this block into the consumer
+  kInlineIntoConsumer,
+  // Inline this block into the producer
+  kInlineIntoProducer,
+};
+
+class AutoInline : public AutoGenRule {
+ public:
+  AutoInline(const common::Target& target, const std::unordered_set<std::string>& no_inline_output_names);
+  ~AutoInline() = default;
+
+  RuleApplyType Init(ir::IRSchedule* ir_schedule) override;
+
+  void Apply(int index) override;
+
+  std::string GetRuleName() const override;
+
+  AutoInlineType AnalyzeInlineType(const Expr& sche_block_realize_expr, ir::IRSchedule* ir_sch) const;
+
+  bool CanInlineIntoConsumer(const Expr& sche_block_realize_expr, ir::IRSchedule* ir_sch) const;
+
+  RuleApplyType AnalyseApplyType(SearchState state, const std::string& block_name) const override;
+
+  std::vector<SearchState> ApplyOnBlock(SearchState state, const std::string& block_name) override;
+
+ private:
+  void Apply(ir::IRSchedule* ir_schedule, ir::Expr& block_expr);
+
+ private:
+  std::vector<ir::Expr> all_block_realizes_;
+  std::vector<std::pair<int, AutoInlineType>> apply_indices_and_type_;
+  std::unordered_set<std::string> no_inline_output_names_;
+};
+
+}  // namespace auto_schedule
+}  // namespace cinn
diff --git a/paddle/cinn/auto_schedule/search_space/auto_gen_rule/auto_inline_test.cc b/paddle/cinn/auto_schedule/search_space/auto_gen_rule/auto_inline_test.cc
new file mode 100644
index 0000000000000..a8d8ee9f9d0c0
--- /dev/null
+++ b/paddle/cinn/auto_schedule/search_space/auto_gen_rule/auto_inline_test.cc
@@ -0,0 +1,493 @@
+// Copyright (c) 2022 CINN Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+ +#include "cinn/auto_schedule/search_space/auto_gen_rule/auto_inline.h" + +#include +#include + +#include +#include +#include + +#include "cinn/auto_schedule/search_space/auto_gen_rule/auto_gen_rule.h" +#include "cinn/auto_schedule/search_space/auto_gen_rule/test_helper.h" +#include "cinn/cinn.h" +#include "cinn/frontend/net_builder.h" +#include "cinn/hlir/framework/op_lowering.h" +#include "cinn/hlir/framework/pass.h" +#include "cinn/ir/function_base.h" +#include "cinn/ir/ir.h" +#include "cinn/ir/ir_base.h" +#include "cinn/ir/ir_printer.h" +#include "cinn/ir/ir_schedule.h" +#include "cinn/ir/tensor.h" +#include "cinn/lang/compute.h" +#include "cinn/lang/lower.h" +#include "cinn/poly/stage.h" +#include "cinn/runtime/flags.h" +#include "cinn/utils/string.h" +#include "tests/concrete_program_builder.h" + +DECLARE_bool(cinn_ir_schedule); + +namespace cinn { +namespace auto_schedule { + +using ::cinn::hlir::framework::Graph; +using ::cinn::hlir::framework::OpLowerer; + +TEST(AutoInline, SingleLoopInline) { + srand(0); + Context::Global().ResetNameId(); + Target target = common::DefaultHostTarget(); + + Expr M(32); + + Placeholder A("A", {M}); + ir::Tensor B = Compute( + {M}, [&](Var i) { return A(i) * ir::Expr(2.f); }, "B"); + ir::Tensor C = Compute( + {M}, [&](Var i) { return B(i) + ir::Expr(1.f); }, "C"); + + poly::StageMap stages = CreateStages({A, B, C}); + std::vector funcs = + lang::LowerVec("TestAutoInline_SingleLoopInline", stages, {A, C}, {}, {}, nullptr, target, true); + + VLOG(6) << "Expr after lowering:"; + VLOG(6) << funcs[0]->body; + + /* + * We have to use ComputeAt to put two Tensor loops together to create IR + * test case for AutoInline. + */ + ir::IRSchedule ir_sch(ir::ModuleExpr(std::vector{funcs[0]->body})); + SearchState state(ir_sch, 0, {}); + ir::Expr block_b = ir_sch.GetBlock("B"); + std::vector loops = ir_sch.GetLoops("C"); + ir_sch.ComputeAt(block_b, loops[0]); + + ir::ModuleExpr mod_expr_before_inline = ir_sch.GetModule(); + VLOG(6) << "Expr after ComputeAt:"; + VLOG(6) << mod_expr_before_inline.GetExprs()[0]; + + AutoInline auto_inline(target, {"C"}); + EXPECT_EQ(auto_inline.Init(&ir_sch), RuleApplyType::kApplyAndPruneOtherRules); + EXPECT_EQ(auto_inline.NumberApplicable(), 1); + auto_inline.ApplyRandomly(); + std::vector exprs = ir_sch.GetModule().GetExprs(); + EXPECT_EQ(exprs.size(), 1UL); + + // ApplyOnBlock + EXPECT_EQ(auto_inline.AnalyseApplyType(state, "B"), RuleApplyType::kApplyAndPruneOtherRules); + auto new_states = auto_inline.ApplyOnBlock(state, "B"); + + auto test_func = [](ir::IRSchedule* ir_sch) { + ir::ModuleExpr mod_expr_after_inline = ir_sch->GetModule(); + std::vector exprs = mod_expr_after_inline.GetExprs(); + EXPECT_EQ(exprs.size(), 1UL); + + std::stringstream ss; + ss << exprs[0]; + + std::string expr_str = ss.str(); + VLOG(6) << "After AutoInline:"; + VLOG(6) << expr_str; + + std::string target_str = R"ROC({ + ScheduleBlock(root) + { + { + serial for (i, 0, 32) + { + ScheduleBlock(C) + { + i0 = axis.bind(i) + read_buffers(_A[i0(0:32)]) + write_buffers(_C[i0(0:32)]) + C[i0] = ((A[i0] * 2.00000000f) + 1.00000000f) + } + } + } + } +})ROC"; + EXPECT_EQ(expr_str, target_str); + }; + + test_func(&ir_sch); + test_func(&new_states[0]->ir_schedule); + + // Cannot inline above expr again + EXPECT_EQ(auto_inline.Init(&ir_sch), RuleApplyType::kCannotApply); + EXPECT_EQ(auto_inline.AnalyseApplyType(new_states[0], "C"), RuleApplyType::kCannotApply); +} + +TEST(AutoInline, AddReluInline) { + srand(0); + Context::Global().ResetNameId(); + Target target = 
common::DefaultHostTarget(); + + frontend::NetBuilder builder("test"); + + auto a = builder.CreateInput(Float(32), {1, 64, 112, 112}, "A"); + auto b = builder.CreateInput(Float(32), {64}, "B"); + auto c = builder.Add(a, b, 1); + auto d = builder.Relu(c); + + frontend::Program program = builder.Build(); + + FLAGS_cinn_ir_schedule = true; + auto graph = std::make_shared(program, target); + hlir::framework::ApplyPass(graph.get(), "OpFusionPass"); + + const auto& dtype_dict = graph->GetAttrs>("inferdtype"); + const auto& shape_dict = graph->GetAttrs>("infershape"); + auto op_lowerer = std::make_unique(dtype_dict, shape_dict, target); + + EXPECT_EQ(graph->fusion_groups.size(), 1UL); + std::vector funcs = op_lowerer->LowerWithoutSchedule(graph->fusion_groups[0]); + + VLOG(6) << "Expr before auto inline: " << funcs[0]->body; + + ir::ModuleExpr mod_expr_before_inline(std::vector({funcs[0]->body})); + ir::IRSchedule ir_sch(mod_expr_before_inline); + SearchState state(ir_sch, 0, {}); + + AutoInline auto_inline(target, {"var_2"}); + EXPECT_EQ(auto_inline.Init(&ir_sch), RuleApplyType::kApplyAndPruneOtherRules); + EXPECT_EQ(auto_inline.NumberApplicable(), 2); + + auto_inline.Apply(1); + ir::ModuleExpr mod_expr_after_inline = ir_sch.GetModule(); + std::vector exprs = mod_expr_after_inline.GetExprs(); + EXPECT_EQ(exprs.size(), 1UL); + + std::stringstream ss; + ss << exprs[0]; + + std::string expr_str = ss.str(); + VLOG(6) << "After AutoInline:"; + VLOG(6) << expr_str; + + // Auto Inline again + EXPECT_EQ(auto_inline.Init(&ir_sch), RuleApplyType::kApplyAndPruneOtherRules); + EXPECT_EQ(auto_inline.NumberApplicable(), 1); + auto_inline.Apply(0); + + // ApplyOnBlock + EXPECT_EQ(auto_inline.AnalyseApplyType(state, "var_1"), RuleApplyType::kApplyAndPruneOtherRules); + auto new_states = auto_inline.ApplyOnBlock(state, "var_1"); + // Auto Inline again + EXPECT_EQ(auto_inline.AnalyseApplyType(new_states[0], "var_3"), RuleApplyType::kApplyAndPruneOtherRules); + new_states = auto_inline.ApplyOnBlock(new_states[0], "var_3"); + + auto test_func = [](ir::IRSchedule* ir_sch) { + ir::ModuleExpr final_mod_expr = ir_sch->GetModule(); + auto exprs = final_mod_expr.GetExprs(); + EXPECT_EQ(exprs.size(), 1UL); + + std::stringstream ss; + ss << exprs[0]; + + std::string expr_str = ss.str(); + VLOG(6) << "Final AutoInline:"; + VLOG(6) << expr_str; + + std::string target_str = R"ROC({ + ScheduleBlock(root) + { + { + serial for (i, 0, 1) + { + serial for (j, 0, 64) + { + serial for (k, 0, 112) + { + serial for (a, 0, 112) + { + ScheduleBlock(var_2) + { + i0, i1, i2, i3 = axis.bind(0, j, k, a) + read_buffers(_A[i0(0:1), i1(0:64), i2(0:112), i3(0:112)], _B[i1(0:64)]) + write_buffers(_var_2[i0(0:1), i1(0:64), i2(0:112), i3(0:112)]) + var_2[i0, i1, i2, i3] = cinn_max((A[i0, i1, i2, i3] + B[i1]), 0.00000000f) + } + } + } + } + } + } + } +})ROC"; + EXPECT_EQ(expr_str, target_str); + }; + + test_func(&ir_sch); + test_func(&new_states[0]->ir_schedule); + + // Cannot inline above expr again + EXPECT_EQ(auto_inline.Init(&ir_sch), RuleApplyType::kCannotApply); + EXPECT_EQ(auto_inline.AnalyseApplyType(new_states[0], "var_2"), RuleApplyType::kCannotApply); +} + +#ifdef CINN_WITH_CUDA +class TestAutoInline : public TestAutoGenRuleBase {}; + +/* The single chain graph composed of multiple blocks can be inlined into one. + * + * Before AutoInline: The output of the previous block is the input of another block. 
+ * Loop1: + * x1 = Add() + * Loop2: + * x2 = Multiply(x1) + * Loop3: + * x3 = Add(x2) + * Loop4: + * x4 = Relu(x3) + * + * After AutoInline: All loops are inlined into a loop. + * Loop: + * Add(Multiply(Add(Relu()))) + */ +TEST_F(TestAutoInline, SingleChain) { + Target target = common::DefaultNVGPUTarget(); + Initialize(target); + std::vector input_names = {"bias", "conv_output", "bn_scale", "bn_offset"}; + std::vector output_names = {"var_6", "var_5", "var_1", "var", "var_0", "var_4", "var_3"}; + std::vector conv_output_shape = {1, 512, 56, 56}; + int32_t channel = conv_output_shape[1]; + std::vector inputs_varinfo({{"conv_output", conv_output_shape}, + {"bias", {channel, 1, 1}}, + {"bn_scale", {channel, 1, 1}}, + {"bn_offset", {channel, 1, 1}}}); + + // Construct the computation graph and convert it to ir::Expr + Context::Global().ResetNameId(); + ir::IRSchedule ir_schedule = MakeIRSchedule(tests::BiasBnReLUBuilder().Build(inputs_varinfo)); + SearchState state(ir_schedule, 0, {}); + std::vector func_bodys = ir_schedule.GetModule().GetExprs(); + ASSERT_EQ(func_bodys.size(), 1UL); + VLOG(6) << "Original Expr:\n" << func_bodys[0]; + + // Apply AutoInline for every block that can be inline + AutoInline auto_inline(target_, {output_names.front()}); + EXPECT_EQ(auto_inline.AnalyseApplyType(state, "var_3"), RuleApplyType::kApplyAndPruneOtherRules); + auto new_states = auto_inline.ApplyOnBlock(state, "var_3"); + std::vector inline_block_names({"var_4", "var_5", "var_6", "var", "var_0", "var_1"}); + for (const auto& inline_block_name : inline_block_names) { + new_states = auto_inline.ApplyOnBlock(new_states[0], inline_block_name); + } + std::vector exprs = new_states[0]->ir_schedule.GetModule().GetExprs(); + EXPECT_EQ(exprs.size(), 1UL); + VLOG(6) << "Expr after AutoInline applied on block: " << exprs[0]; + + // build ir::Module and debug source code + auto build_module_auto = BuildIRModule(new_states[0]->ir_schedule); + auto build_module_manually = + BuildIRModule(MakeIRSchedule(tests::BiasBnReLUBuilder().Build(inputs_varinfo), -1, true)); + auto source_code_auto = GenSourceCode(build_module_auto); + VLOG(6) << " auto-schedule source code:\n" << source_code_auto; + auto source_code_manually = GenSourceCode(build_module_manually); + VLOG(6) << " manually-schedule source code:\n" << source_code_manually; + + CheckResult(GenExecutableKernel(build_module_auto), + GenExecutableKernel(build_module_manually), + input_names, + output_names, + {{conv_output_shape[1], 1, 1}, conv_output_shape, conv_output_shape, conv_output_shape}, + {conv_output_shape, {1}, {1}, {1}, {1}, {1}, {1}}, + target); +} + +/* An op can be inlined into multiple consumers at the same time. + * + * Before AutoInline: The output of Exp is used by Add and Multiply. + * Loop1: + * x = Exp() + * Loop2: + * y = Add(x) + * Loop3: + * z = Multiply(x) + * + * After AutoInline: Exp is inlined into Add and Multiply. 
+ * Loop: + * y = Add(Exp()) + * z = Multiply(Exp()) + */ +TEST_F(TestAutoInline, InlineToMultiConsumers) { + Target target = common::DefaultNVGPUTarget(); + Initialize(target); + std::vector input_names = {"x"}; + std::vector output_names = {"var_2", "var_1", "var_0"}; + std::vector input_shape{256, 256}; + std::vector inputs_varinfo({{"x", input_shape}}); + + // Construct the computation graph and convert it to ir::Expr + Context::Global().ResetNameId(); + ir::IRSchedule ir_schedule = MakeIRSchedule(tests::ExpTwoConsumersOpBuilder().Build(inputs_varinfo)); + SearchState state(ir_schedule, 0, {}); + std::vector func_bodys = ir_schedule.GetModule().GetExprs(); + ASSERT_EQ(func_bodys.size(), 1UL); + VLOG(6) << "Original Expr:\n" << func_bodys[0]; + + // Apply AutoInline for every block that can be inline + AutoInline auto_inline(target_, {output_names.front()}); + EXPECT_EQ(auto_inline.AnalyseApplyType(state, "var_0"), RuleApplyType::kApplyAndPruneOtherRules); + auto new_states = auto_inline.ApplyOnBlock(state, "var_1"); + new_states = auto_inline.ApplyOnBlock(state, "var_0"); + std::vector exprs = new_states[0]->ir_schedule.GetModule().GetExprs(); + EXPECT_EQ(exprs.size(), 1UL); + VLOG(6) << "Expr after AutoInline applied on block: " << exprs[0]; + + // build ir::Module and debug source code + auto build_module_auto = BuildIRModule(new_states[0]->ir_schedule); + auto build_module_manually = + BuildIRModule(MakeIRSchedule(tests::ExpTwoConsumersOpBuilder().Build(inputs_varinfo), -1, true)); + auto source_code_auto = GenSourceCode(build_module_auto); + VLOG(6) << " auto-schedule source code:\n" << source_code_auto; + auto source_code_manually = GenSourceCode(build_module_manually); + VLOG(6) << " manually-schedule source code:\n" << source_code_manually; + + CheckResult(GenExecutableKernel(build_module_auto), + GenExecutableKernel(build_module_manually), + input_names, + output_names, + {input_shape}, + {input_shape, {1}, {1}}, + target); +} + +/* Operators of type elementwise or injective can all be inlined. 
+ * + * Before AutoInline: A graph of Gather, Add and Subtract + * Loop1: + * x1 = Gather() + * Loop2: + * x2 = Add(x1) + * Loop3: + * y1 = Gather() + * Loop4: + * z1 = Subtract(y1, x1) + * + * After AutoInline: All loops are inlined to one + * z1 = Subtract(Gather(), Add(Gather())) + */ +TEST_F(TestAutoInline, OnlySpatialOp) { + Target target = common::DefaultNVGPUTarget(); + Initialize(target); + std::vector input_names = {"x", "y"}; + std::vector output_names = { + "var_6", "var_4", "constant_idx_last", "constant_idx_first", "var_2", "var_5"}; + std::vector input_shape{256, 256}; + std::vector inputs_varinfo({{"x", input_shape}, {"y", input_shape}}); + + // Construct the computation graph and convert it to ir::Expr + Context::Global().ResetNameId(); + ir::IRSchedule ir_schedule = MakeIRSchedule(tests::GatherAddSubBuilder().Build(inputs_varinfo)); + SearchState state(ir_schedule, 0, {}); + std::vector func_bodys = ir_schedule.GetModule().GetExprs(); + ASSERT_EQ(func_bodys.size(), 1UL); + VLOG(6) << "Original Expr:\n" << func_bodys[0]; + + // Apply AutoInline for every block that can be inline + AutoInline auto_inline(target_, {output_names.front()}); + EXPECT_EQ(auto_inline.AnalyseApplyType(state, "constant_idx_first"), RuleApplyType::kApplyAndPruneOtherRules); + auto new_states = auto_inline.ApplyOnBlock(state, "constant_idx_first"); + std::vector inline_block_names({"constant_idx_last", "var_2", "var_5", "var_4"}); + for (const auto& inline_block_name : inline_block_names) { + new_states = auto_inline.ApplyOnBlock(new_states[0], inline_block_name); + } + std::vector exprs = new_states[0]->ir_schedule.GetModule().GetExprs(); + EXPECT_EQ(exprs.size(), 1UL); + VLOG(6) << "Expr after AutoInline applied on block: " << exprs[0]; + + // build ir::Module and debug source code + auto build_module_auto = BuildIRModule(new_states[0]->ir_schedule); + auto build_module_manually = + BuildIRModule(MakeIRSchedule(tests::GatherAddSubBuilder().Build(inputs_varinfo), -1, true)); + auto source_code_auto = GenSourceCode(build_module_auto); + VLOG(6) << " auto-schedule source code:\n" << source_code_auto; + auto source_code_manually = GenSourceCode(build_module_manually); + VLOG(6) << " manually-schedule source code:\n" << source_code_manually; + + CheckResult(GenExecutableKernel(build_module_auto), + GenExecutableKernel(build_module_manually), + input_names, + output_names, + {input_shape, input_shape}, + {input_shape, {1}, {1}, {1}, {1}, {1}}, + target); +} + +/* An op that does not read data can be directly inlined. + * + * Before AutoInline: fill_constant op is in a separate loop. 
+ * Loop1: + * x = fill_constant() + * Loop2: + * y = Add(x) + * + * After AutoInline: fill_constant op is inlined into other loop + * Loop: + * y = Add(fill_constant()) + */ +TEST_F(TestAutoInline, NoReadBufferOp) { + Target target = common::DefaultNVGPUTarget(); + Initialize(target); + std::vector input_names = {"x"}; + std::vector output_names = {"var_0", "fill_constant"}; + std::vector input_shape{256, 256}; + std::vector inputs_varinfo({{"x", input_shape}}); + + // Construct the computation graph and convert it to ir::Expr + ir::IRSchedule ir_schedule = MakeIRSchedule(tests::FillConstantAddBuilder().Build(inputs_varinfo)); + SearchState state(ir_schedule, 0, {}); + std::vector func_bodys = ir_schedule.GetModule().GetExprs(); + ASSERT_EQ(func_bodys.size(), 1UL); + VLOG(6) << "Original Expr:\n" << func_bodys[0]; + + // Apply AutoInline for every block that can be inline + AutoInline auto_inline(target_, {output_names.front()}); + EXPECT_EQ(auto_inline.AnalyseApplyType(state, "fill_constant"), RuleApplyType::kApplyAndPruneOtherRules); + auto new_states = auto_inline.ApplyOnBlock(state, "fill_constant"); + std::vector exprs = new_states[0]->ir_schedule.GetModule().GetExprs(); + EXPECT_EQ(exprs.size(), 1UL); + VLOG(6) << "Expr after AutoInline applied on block: " << exprs[0]; + + // build ir::Module and debug source code + auto build_module_auto = BuildIRModule(new_states[0]->ir_schedule); + auto build_module_manually = + BuildIRModule(MakeIRSchedule(tests::FillConstantAddBuilder().Build(inputs_varinfo), -1, true)); + auto source_code_auto = GenSourceCode(build_module_auto); + VLOG(6) << " auto-schedule source code:\n" << source_code_auto; + auto source_code_manually = GenSourceCode(build_module_manually); + VLOG(6) << " manually-schedule source code:\n" << source_code_manually; + + CheckResult(GenExecutableKernel(build_module_auto), + GenExecutableKernel(build_module_manually), + input_names, + output_names, + {input_shape}, + {input_shape, {1}}, + target); +} + +/* An op can be inlined into multiple producers at the same time. + */ +// TEST_F(TestAutoInline, InlineToMultiProducers) { +// TODO(6clc): Complete the unit test, once ReverseComputeInline is ready. +// } +#endif +} // namespace auto_schedule +} // namespace cinn diff --git a/paddle/cinn/auto_schedule/search_space/auto_gen_rule/auto_unroll.cc b/paddle/cinn/auto_schedule/search_space/auto_gen_rule/auto_unroll.cc new file mode 100644 index 0000000000000..a4bc75ef1af83 --- /dev/null +++ b/paddle/cinn/auto_schedule/search_space/auto_gen_rule/auto_unroll.cc @@ -0,0 +1,120 @@ +// Copyright (c) 2022 CINN Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "cinn/auto_schedule/search_space/auto_gen_rule/auto_unroll.h" + +#include + +#include + +#include "cinn/ir/collect_ir_nodes.h" +#include "cinn/ir/ir_printer.h" +#include "cinn/ir/ir_schedule.h" +#include "cinn/optim/ir_copy.h" + +namespace cinn { +namespace auto_schedule { + +static std::vector auto_unroll_options = {0, 8, 32, 128}; + +bool AutoUnroll::MeetCondition(const ir::ScheduleBlock* schedule_block) const { + // whether any block has reduce iter + auto has_reduce_iter = [](const Expr* x) { + auto* block_realize = x->As(); + if (block_realize) { + auto* schedule_block = block_realize->schedule_block.As(); + CHECK(schedule_block) << "schedule_block field is not a ScheduleBlock"; + for (auto&& var : schedule_block->iter_vars) { + if (var->is_reduce_axis) { + VLOG(6) << "find ScheduleBlockRealize:" << *x << " has reduce_axis:" << var; + return true; + } + } + } + return false; + }; + // whether has any for-loop with non-serial type + auto has_nonserial_loop = [](const Expr* x) { + if (x->As() && x->As()->for_type() != ir::ForType::Serial) { + VLOG(6) << "find non-serial loop:" << *x; + return true; + } + return false; + }; + + auto find_target_exprs = ir::CollectIRNodesWithoutTensor( + schedule_block->body, + [&has_reduce_iter, &has_nonserial_loop](const Expr* x) { return has_reduce_iter(x) || has_nonserial_loop(x); }); + + return !find_target_exprs.empty(); +} + +RuleApplyType AutoUnroll::Init(ir::IRSchedule* ir_schedule) { + ir_schedule_ = ir_schedule; + auto block_realizes = ir_schedule_->GetAllBlocks(); + + // A schedule block can perform `auto_unroll` rule should meet two conditions: + // (1) it is a root block + // (2) MeetCondition returns true with it + applicable_schedule_blocks_.clear(); + std::set deduplicate_results; + for (size_t i = 0; i < block_realizes.size(); ++i) { + // find root block + Expr root_block = ir_schedule_->GetRootBlock(block_realizes[i]); + auto* block_realize = root_block.As(); + CHECK(block_realize) << "stmt is not a ScheduleBlockRealize:" << root_block; + auto* schedule_block = block_realize->schedule_block.As(); + CHECK(schedule_block) << "schedule_block field is not a ScheduleBlock:" << Expr(block_realize); + if (MeetCondition(schedule_block)) { + deduplicate_results.emplace(root_block); + } + } + applicable_schedule_blocks_ = {deduplicate_results.begin(), deduplicate_results.end()}; + num_applicable_ = applicable_schedule_blocks_.size(); + VLOG(6) << "Collect applicable_schedule_blocks_:" << num_applicable_; + + return num_applicable_ > 0 ? RuleApplyType::kApplyAndPruneOtherRules : RuleApplyType::kCannotApply; +} + +void AutoUnroll::Apply(int index) { + CHECK_LT(index, applicable_schedule_blocks_.size()) << "invalid apply index:" << index; + auto applied_block = applicable_schedule_blocks_.at(index); + int max_step = auto_unroll_options[std::rand() % auto_unroll_options.size()]; + ir_schedule_->Annotate(applied_block, ir::attr::auto_unroll_max_step, max_step); + return; +} + +RuleApplyType AutoUnroll::AnalyseApplyType(SearchState state, const std::string& block_name) const { + Expr block_expr = state->ir_schedule.GetBlock(block_name); + Expr root_block = state->ir_schedule.GetRootBlock(block_expr); + auto* block_realize = root_block.As(); + CHECK(block_realize) << "stmt is not a ScheduleBlockRealize:" << root_block; + auto* schedule_block = block_realize->schedule_block.As(); + CHECK(schedule_block) << "schedule_block field is not a ScheduleBlock:" << Expr(block_realize); + + return MeetCondition(schedule_block) ? 
RuleApplyType::kApplyAndPruneOtherRules : RuleApplyType::kCannotApply; +} + +std::vector AutoUnroll::ApplyOnBlock(SearchState state, const std::string& block_name) { + SearchState new_state = state.Copy(); + Expr block_expr = new_state->ir_schedule.GetBlock(block_name); + Expr applied_block = new_state->ir_schedule.GetRootBlock(block_expr); + int max_step = auto_unroll_options[std::rand() % auto_unroll_options.size()]; + new_state->ir_schedule.Annotate(applied_block, ir::attr::auto_unroll_max_step, max_step); + + return {new_state}; +} + +} // namespace auto_schedule +} // namespace cinn diff --git a/paddle/cinn/auto_schedule/search_space/auto_gen_rule/auto_unroll.h b/paddle/cinn/auto_schedule/search_space/auto_gen_rule/auto_unroll.h new file mode 100644 index 0000000000000..f1b67d173cf3f --- /dev/null +++ b/paddle/cinn/auto_schedule/search_space/auto_gen_rule/auto_unroll.h @@ -0,0 +1,54 @@ +// Copyright (c) 2022 CINN Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include + +#include "cinn/auto_schedule/search_space/auto_gen_rule/auto_gen_rule.h" +#include "cinn/ir/ir.h" +#include "cinn/ir/ir_schedule.h" + +namespace cinn { +namespace auto_schedule { + +// This rule can be applied in a ScheduleBlock has reduce axis or has loops with non-serial type. +// As a result, it will set a attribute with key named ir::attr::auto_unroll_max_step and value +// indicating max permitted unrolled step in the applied ScheduleBlock. Finally, UnrollLoop pass +// will do unroll based on actual situation. +class AutoUnroll : public AutoGenRule { + public: + AutoUnroll(const common::Target& target) : AutoGenRule(target) {} + ~AutoUnroll() = default; + + RuleApplyType Init(ir::IRSchedule* init_schedule) override; + + void Apply(int index) override; + + std::string GetRuleName() const override { return "AutoUnroll"; } + + RuleApplyType AnalyseApplyType(SearchState state, const std::string& block_name) const override; + + std::vector ApplyOnBlock(SearchState state, const std::string& block_name) override; + + private: + bool MeetCondition(const ir::ScheduleBlock* schedule_block) const; + + private: + std::vector applicable_schedule_blocks_; +}; + +} // namespace auto_schedule +} // namespace cinn diff --git a/paddle/cinn/auto_schedule/search_space/auto_gen_rule/auto_unroll_test.cc b/paddle/cinn/auto_schedule/search_space/auto_gen_rule/auto_unroll_test.cc new file mode 100644 index 0000000000000..99688a2da6738 --- /dev/null +++ b/paddle/cinn/auto_schedule/search_space/auto_gen_rule/auto_unroll_test.cc @@ -0,0 +1,107 @@ +// Copyright (c) 2022 CINN Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "cinn/auto_schedule/search_space/auto_gen_rule/auto_unroll.h"
+
+#include <glog/logging.h>
+#include <gtest/gtest.h>
+
+#include "cinn/cinn.h"
+#include "cinn/lang/lower.h"
+
+namespace cinn {
+namespace auto_schedule {
+
+TEST(AutoUnroll, Init) {
+  using namespace ir;
+
+  Expr M(100);
+  Expr N(4);
+  Placeholder<float> A("A", {M, N});
+  Placeholder<float> B("B", {M, N});
+  Tensor C = Compute(
+      {M, N}, [&](Var i, Var j) { return A(i, j) * B(i, j); }, "C");
+
+#ifdef CINN_WITH_CUDA
+  Target target = common::DefaultNVGPUTarget();
+#else
+  Target target = common::DefaultHostTarget();
+#endif
+  auto stages = CreateStages({C});
+  auto funcs = cinn::lang::LowerVec("test_init", stages, {A, B, C}, {}, {}, nullptr, target, true);
+
+  auto ast_expr = funcs[0]->body;
+  ir::IRSchedule init_schedule(ir::ModuleExpr({ast_expr}));
+  AutoUnroll test_rule(target);
+  // does not meet the specific condition
+  ASSERT_EQ(test_rule.Init(&init_schedule), RuleApplyType::kCannotApply);
+}
+
+TEST(AutoUnroll, UnrollableApply) {
+  using namespace ir;
+
+  Expr M(100);
+  Expr N(4);
+  Expr K(32);
+  Placeholder<float> A("A", {M, K});
+  Placeholder<float> B("B", {K, N});
+  Var k(K.as_int32(), "k0");
+  Tensor C = Compute(
+      {M, N}, [&](Var i, Var j) { return ReduceSum(A(i, k) * B(k, j), {k}); }, "C");
+
+#ifdef CINN_WITH_CUDA
+  Target target = common::DefaultNVGPUTarget();
+#else
+  Target target = common::DefaultHostTarget();
+#endif
+  auto stages = CreateStages({C});
+  auto funcs = cinn::lang::LowerVec("test_unrollable", stages, {A, B, C}, {}, {}, nullptr, target, true);
+
+  auto ast_expr = funcs[0]->body;
+  auto* init_block_realize = ast_expr.As<ir::Block>()->stmts.front().As<ir::ScheduleBlockRealize>();
+  auto* init_schedule_block = init_block_realize->schedule_block.As<ir::ScheduleBlock>();
+  ASSERT_NE(init_schedule_block, nullptr);
+  ASSERT_TRUE(init_schedule_block->attrs.empty());
+  VLOG(6) << "Before auto-unroll:\n" << ast_expr;
+
+  AutoUnroll test_rule(target);
+  ir::IRSchedule ir_schedule(ir::ModuleExpr({ast_expr}));
+  SearchState state(ir_schedule, 0, {});
+  ASSERT_EQ(test_rule.Init(&ir_schedule), RuleApplyType::kApplyAndPruneOtherRules);
+  EXPECT_EQ(test_rule.NumberApplicable(), 1);
+  test_rule.ApplyRandomly();
+
+  // ApplyOnBlock
+  EXPECT_EQ(test_rule.AnalyseApplyType(state, "C"), RuleApplyType::kApplyAndPruneOtherRules);
+  std::vector<SearchState> states = test_rule.ApplyOnBlock(state, "C");
+
+  auto test_func = [](IRSchedule* ir_sch) {
+    Expr applied_expr = ir_sch->GetModule().GetExprs().front();
+    auto* applied_block_realize = applied_expr.As<ir::Block>()->stmts.front().As<ir::ScheduleBlockRealize>();
+    auto* applied_schedule_block = applied_block_realize->schedule_block.As<ir::ScheduleBlock>();
+    ASSERT_FALSE(applied_schedule_block->attrs.empty());
+    EXPECT_EQ(applied_schedule_block->attrs.count(ir::attr::auto_unroll_max_step), 1);
+    const auto& attr_value = applied_schedule_block->attrs.at(ir::attr::auto_unroll_max_step);
+    const int* max_step = absl::get_if<int>(&attr_value);
+    EXPECT_NE(max_step, nullptr);
+    EXPECT_LE(*max_step, 128);
+    VLOG(6) << "After auto-unroll:max_step=" << *max_step << ", Ast:\n" << ir_sch->GetModule().GetExprs().front();
+  };
+
+  test_func(&ir_schedule);
+  test_func(&states[0]->ir_schedule);
+}
+
+}  // namespace auto_schedule
+}  // namespace cinn
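Taken together, the two tests above exercise both entry points shared by every AutoGenRule. As a reference, here is a minimal sketch of the two usage patterns, assuming an ir::IRSchedule named ir_sch and a SearchState named state built as in TEST(AutoUnroll, UnrollableApply), where "C" is the reduction output block of that test:

// Pattern 1: stateful interface driven by Init(), as the random search loop uses it.
AutoUnroll rule(target);
if (rule.Init(&ir_sch) != RuleApplyType::kCannotApply) {
  rule.ApplyRandomly();  // or rule.Apply(i) for any 0 <= i < rule.NumberApplicable()
}

// Pattern 2: stateless per-block interface; the input state is copied, never mutated.
if (rule.AnalyseApplyType(state, "C") != RuleApplyType::kCannotApply) {
  std::vector<SearchState> next_states = rule.ApplyOnBlock(state, "C");
}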
diff --git a/paddle/cinn/auto_schedule/search_space/auto_gen_rule/mix_rules_test.cc b/paddle/cinn/auto_schedule/search_space/auto_gen_rule/mix_rules_test.cc new file mode 100644 index 0000000000000..21ed0e94f9ddf --- /dev/null +++ b/paddle/cinn/auto_schedule/search_space/auto_gen_rule/mix_rules_test.cc @@ -0,0 +1,66 @@ +// Copyright (c) 2022 CINN Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include + +#include + +#include "cinn/auto_schedule/search_space/auto_gen_rule/auto_gen_rule.h" +#include "cinn/auto_schedule/search_space/auto_gen_rule/multi_level_tiling.h" +#include "cinn/auto_schedule/search_space/auto_gen_rule/test_helper.h" +#include "cinn/ir/ir_printer.h" +#include "cinn/ir/ir_schedule.h" +#include "tests/program_builder.h" + +namespace cinn { +namespace auto_schedule { + +class TestMixRules : public TestAutoGenRuleBase { + public: + std::vector default_input_names = {"X", "Y"}; + std::vector default_output_names = {"temp_matmul_out"}; +}; + +TEST_F(TestMixRules, 2DMatmulOnMultiTilingRelated) { + frontend::Program matmul_op = tests::OpBuilder("matmul").Build({{"X", {32, 32}}, {"Y", {32, 32}}}); + Initialize(common::DefaultNVGPUTarget()); + ir::IRSchedule ir_schedule = MakeIRSchedule(matmul_op); + std::vector func_bodys = ir_schedule.GetModule().GetExprs(); + ASSERT_EQ(func_bodys.size(), 1UL); + VLOG(6) << "Original Expr:\n" << func_bodys[0]; + + // Apply MultiLevelTiling + MultiLevelTiling multi_level_tiling(target_, MultiLevelTiling::kConfigs.at(target_.arch)); + multi_level_tiling.Init(&ir_schedule); + ASSERT_EQ(multi_level_tiling.NumberApplicable(), 1); + multi_level_tiling.ApplyRandomly(); + VLOG(6) << "after MultiLevelTiling Expr:\n" << func_bodys[0]; + + // build ir::Module and debug source code + auto ir_module = BuildIRModule(ir_schedule); + auto source_code = GenSourceCode(ir_module); + VLOG(6) << "scheduled source code:\n" << source_code; + // execute and check precision + CheckResult(GenExecutableKernel(ir_module), + GenExecutableKernel(BuildIRModule(MakeIRSchedule(matmul_op, /* apply_manual_schedule */ true))), + default_input_names, + default_output_names, + {{32, 32}, {32, 32}}, + {{32, 32}}, + target_); +} + +} // namespace auto_schedule +} // namespace cinn diff --git a/paddle/cinn/auto_schedule/search_space/auto_gen_rule/multi_level_tiling.cc b/paddle/cinn/auto_schedule/search_space/auto_gen_rule/multi_level_tiling.cc new file mode 100644 index 0000000000000..3dee778f8f886 --- /dev/null +++ b/paddle/cinn/auto_schedule/search_space/auto_gen_rule/multi_level_tiling.cc @@ -0,0 +1,401 @@ +// Copyright (c) 2022 CINN Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "cinn/auto_schedule/search_space/auto_gen_rule/multi_level_tiling.h"
+
+#include <glog/logging.h>
+
+#include <algorithm>
+#include <cstdlib>
+#include <string>
+#include <unordered_map>
+#include <utility>
+#include <vector>
+
+#include "cinn/auto_schedule/analysis/analyze_ir.h"
+#include "cinn/auto_schedule/search_space/auto_gen_rule/auto_gen_rule.h"
+#include "cinn/common/target.h"
+#include "cinn/ir/buffer.h"
+#include "cinn/ir/collect_ir_nodes.h"
+#include "cinn/ir/ir.h"
+#include "cinn/ir/ir_base.h"
+#include "cinn/ir/ir_printer.h"
+#include "cinn/ir/ir_schedule.h"
+#include "cinn/ir/tensor.h"
+#include "cinn/optim/ir_copy.h"
+
+namespace cinn {
+namespace auto_schedule {
+
+MultiLevelTiling::MultiLevelTiling(const common::Target& target, const Config& config)
+    : AutoGenRule(target), config_(config) {
+  for (int i = 0; i < config_.tile_struct.size(); ++i) {
+    if (config_.tile_struct[i] == 'S') {
+      s_indices_.push_back(i);
+    } else if (config_.tile_struct[i] == 'R') {
+      r_indices_.push_back(i);
+    } else {
+      CHECK(false) << "Illegal tiling structure string";
+    }
+  }
+}
+
+bool MultiLevelTiling::MeetCondition(const ir::ScheduleBlockRealize& sche_block_realize) const {
+  return NeedsMultiLevelTiling(sche_block_realize);
+}
+
+RuleApplyType MultiLevelTiling::Init(ir::IRSchedule* ir_schedule) {
+  ir_schedule_ = ir_schedule;
+  all_block_realizes_ = ir_schedule_->GetAllBlocks();
+  applicable_indices_.clear();
+  num_applicable_ = 0;
+  for (size_t i = 0; i < all_block_realizes_.size(); ++i) {
+    ir::ScheduleBlockRealize* sche_block_realize = all_block_realizes_[i].As<ir::ScheduleBlockRealize>();
+    AnalyzeScheduleBlockReadWriteBuffer(sche_block_realize->schedule_block.As<ir::ScheduleBlock>());
+    if (MeetCondition(*sche_block_realize)) {
+      ++num_applicable_;
+      applicable_indices_.push_back(i);
+    }
+  }
+
+  return num_applicable_ > 0 ? RuleApplyType::kApplyAndPruneOtherRules : RuleApplyType::kCannotApply;
+}
+
+void MultiLevelTiling::Apply(int index) {
+  CHECK(ir_schedule_ != nullptr) << "Run MultiLevelTiling::Apply without Init";
+  CHECK(num_applicable_ > 0 && applicable_indices_.size() == num_applicable_)
+      << "MultiLevelTiling::Apply pre-condition isn't met";
+  CHECK(index >= 0 && num_applicable_ > index)
+      << "Invalid index for MultiLevelTiling::Apply, the index must satisfy 0 <= index < NumberApplicable(), "
+      << "currently index = " << index << ", NumberApplicable() = " << num_applicable_;
+
+  int apply_index = applicable_indices_[index];
+  std::string block_name = all_block_realizes_[apply_index]
+                               .As<ir::ScheduleBlockRealize>()
+                               ->schedule_block.As<ir::ScheduleBlock>()
+                               ->name;
+  Expr block_expr = all_block_realizes_[apply_index];
+  ApplyTiling(ir_schedule_, block_expr);
+  block_expr = ir_schedule_->GetBlock(block_name);
+  ApplyCacheRead(ir_schedule_, block_expr);
+  block_expr = ir_schedule_->GetBlock(block_name);
+  ApplyCacheWrite(ir_schedule_, block_expr);
+
+  VLOG(4) << "Returning the result of MultiLevelTiling";
+  return;
+}
+
+std::string MultiLevelTiling::GetRuleName() const { return "MultiLevelTiling"; }
+
+RuleApplyType MultiLevelTiling::AnalyseApplyType(SearchState state, const std::string& block_name) const {
+  Expr block_expr = state->ir_schedule.GetBlock(block_name);
+  auto* block_realize = block_expr.As<ir::ScheduleBlockRealize>();
+  CHECK(block_realize) << "stmt is not a ScheduleBlockRealize:" << block_expr;
+  AnalyzeScheduleBlockReadWriteBuffer(block_realize->schedule_block.As<ir::ScheduleBlock>());
+
+  return NeedsMultiLevelTiling(*block_realize) ? RuleApplyType::kApplyAndPruneOtherRules : RuleApplyType::kCannotApply;
+}
+
+std::vector<SearchState> MultiLevelTiling::ApplyOnBlock(SearchState state, const std::string& block_name) {
+  SearchState new_state = state.Copy();
+  ir::IRSchedule* ir_sch = &new_state->ir_schedule;
+  Expr block_expr = ir_sch->GetBlock(block_name);
+  ApplyTiling(ir_sch, block_expr);
+  block_expr = ir_sch->GetBlock(block_name);
+  ApplyCacheRead(ir_sch, block_expr);
+  block_expr = ir_sch->GetBlock(block_name);
+  ApplyCacheWrite(ir_sch, block_expr);
+
+  VLOG(4) << "Returning the result of MultiLevelTiling";
+  return {new_state};
+}
+
+void MultiLevelTiling::ApplyTiling(ir::IRSchedule* ir_schedule, ir::Expr& block_expr) {
+  ir::ScheduleBlockRealize* sche_block_realize = block_expr.As<ir::ScheduleBlockRealize>();
+  ir::ScheduleBlock* sche_block = sche_block_realize->schedule_block.As<ir::ScheduleBlock>();
+  tile_loops_.clear();
+  tile_loops_.resize(config_.tile_struct.size());
+  std::vector<Expr> for_exprs = ir_schedule->GetLoops(block_expr);
+
+  VLOG(5) << "The number of loops to split in MultiLevelTiling is " << for_exprs.size();
+  for (int i = for_exprs.size() - 1; i >= 0; --i) {
+    ir::For* ir_for = for_exprs[i].As<ir::For>();
+    VLOG(6) << "Applying Split for MultiLevelTiling on: " << Expr(ir_for);
+    const std::vector<int>* idx = nullptr;
+    if (sche_block->iter_vars[i]->is_reduce_axis) {
+      idx = &r_indices_;
+    } else {
+      idx = &s_indices_;
+    }  // TODO: support more iterator variable types
+
+    int extent = ir_for->extent.as_int32();  // TODO: extents wider than int32 are not handled yet
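+
+    // (Illustrative note: SamplePerfectTile below draws num_split random factors
+    // whose product equals the loop extent, with the innermost factor capped at
+    // 64; e.g. a loop of extent 128 split three ways might come back as {4, 8, 4}.
+    // The sampling is random, so repeated applications can yield different tilings.)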
+    int num_split = idx->size();
+    if (num_split > 1) {
+      std::vector<Expr> tile_split_factor = ir_schedule->SamplePerfectTile(Expr(ir_for), num_split, 64);
+      std::vector<Expr> splited = ir_schedule->Split(Expr(ir_for), tile_split_factor);
+      VLOG(6) << "Finish Split for MultiLevelTiling on above loop";
+      for (int j = 0; j < num_split; ++j) {
+        tile_loops_[idx->at(j)].push_back(splited[j]);
+      }
+    } else {
+      tile_loops_[idx->at(0)].push_back(for_exprs[i]);
+    }
+  }
+  VLOG(5) << "Finish Split in MultiLevelTiling, before Reorder.";
+
+  // Have to GetLoops again because Split can change Block Expr(s)
+  for_exprs = ir_schedule->GetLoops(sche_block->name);
+  std::unordered_map<std::string, int> loop_var_name_to_idx;
+  for (int i = 0; i < for_exprs.size(); ++i) {
+    loop_var_name_to_idx[for_exprs[i].As<ir::For>()->loop_var->name] = i;
+  }
+  CHECK(loop_var_name_to_idx.size() == for_exprs.size()) << "Loops contain duplicate loop var names after split";
+
+  std::vector<Expr> splited_loops;
+  for (auto& t : tile_loops_) {
+    std::reverse(t.begin(), t.end());
+    for (auto& tile_loop_expr : t) {
+      const ir::For* tile_loop = tile_loop_expr.As<ir::For>();
+      CHECK(tile_loop) << "tiles store non For Expr";
+      int idx = loop_var_name_to_idx[tile_loop->loop_var->name];
+      splited_loops.push_back(for_exprs[idx]);
+    }
+  }
+
+  Expr reordered_expr = ir_schedule->Reorder(splited_loops);
+  VLOG(5) << "Finish Reorder in MultiLevelTiling, now do Fuse and Binding on the main loop chain";
+
+  int num_binds = std::min(config_.bind_axis.size(), tile_loops_.size());
+  for (int i = 0; i < num_binds; ++i) {
+    loop_var_name_to_idx.clear();
+    for_exprs = ir_schedule->GetLoops(sche_block->name);
+    for (int j = 0; j < for_exprs.size(); ++j) {
+      loop_var_name_to_idx[for_exprs[j].As<ir::For>()->loop_var->name] = j;
+    }
+    CHECK(loop_var_name_to_idx.size() == for_exprs.size()) << "Loops contain duplicate loop var names before Fusion";
+
+    // Some loop extents may exceed the limited max factor (for example, exceed
+    // the limit number of CUDA threads), so here we check whether the fused
+    // loop extent, which is the product of the extents of the loops to be
+    // fused, is less than or equal to the max factor.
+    //
+    // If yes, we fuse those loops and bind the fused loop.
+    // If no, we bind the first loop whose extent is less than the factor.
+    int extent_prod = 1;
+    int first_idx_less_than_max_factor = -1;
+    for (int j = 0; j < tile_loops_[i].size(); ++j) {
+      const ir::For* tile_loop = tile_loops_[i][j].As<ir::For>();
+      CHECK(tile_loop) << "tiles store non For Expr";
+      int idx = loop_var_name_to_idx[tile_loop->loop_var->name];
+      tile_loops_[i][j] = for_exprs[idx];
+      int extent = tile_loop->extent.as_int32();  // TODO: extents wider than int32 are not handled yet
+      extent_prod *= extent;
+      if (first_idx_less_than_max_factor == -1 && extent <= max_factor_) {
+        first_idx_less_than_max_factor = idx;
+      }
+    }
+
+    if (extent_prod <= max_factor_) {
+      Expr fused = ir_schedule->Fuse(tile_loops_[i]);
+      ir_schedule->Bind(fused, config_.bind_axis[i]);
+    } else if (first_idx_less_than_max_factor != -1) {
+      ir_schedule->Bind(for_exprs[first_idx_less_than_max_factor], config_.bind_axis[i]);
+    }
+  }
+
+  VLOG(5) << "Do Fuse and Binding on the non-main loop chains";
+  Expr sche_block_top_loop = ir_schedule->GetLoops(sche_block->name)[0];
+
+  if (reordered_expr.As<ir::Block>()) {
+    for (Expr& top_loop : reordered_expr.As<ir::Block>()->stmts) {
+      if (top_loop != sche_block_top_loop) {
+        std::vector<Expr> scan_loop_blocks = ir_schedule->GetAllBlocks();
+        Expr other_loop_chain_schedule;
+        for (Expr& block : scan_loop_blocks) {
+          std::vector<Expr> loop_chain = ir_schedule->GetLoops(block);
+          if (loop_chain[0] == top_loop) {
+            other_loop_chain_schedule = block;
+            break;
+          }
+        }
+        if (!other_loop_chain_schedule.defined()) {
+          LOG(WARNING) << "Has non-main loop chain, but no corresponding ScheduleBlock in MultiLevelTiling";
+          continue;
+        }
+
+        std::string other_loop_schedule_name =
+            other_loop_chain_schedule.As<ir::ScheduleBlockRealize>()->schedule_block.As<ir::ScheduleBlock>()->name;
+        VLOG(6) << "Found other_loop_schedule_name = " << other_loop_schedule_name;
+        int fuse_index = 0;
+        for (int i = 0; i < num_binds; ++i) {
+          for_exprs = ir_schedule->GetLoops(other_loop_schedule_name);
+
+          // Some loop extents may exceed the limited max factor (for example,
+          // exceed the limit number of CUDA threads), so here we check whether
+          // the fused loop extent, which is the product of the extents of the
+          // loops to be fused, is less than or equal to the max factor.
+          //
+          // If yes, we fuse those loops and bind the fused loop.
+          // If no, we bind the first loop whose extent is less than the factor.
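+          // (Worked example, assuming max_factor_ is 1024, the usual CUDA limit on
+          // threads per block: tile extents {8, 16, 4} multiply to 512 <= 1024, so
+          // the three loops are fused and bound to one axis; extents {64, 64} multiply
+          // to 4096 > 1024, so nothing is fused and only the first loop whose extent
+          // fits, 64, is bound.)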
+          int extent_prod = 1;
+          int first_idx_less_than_max_factor = -1;
+          for (int j = 0; j < tile_loops_[i].size(); ++j) {
+            int extent = for_exprs[fuse_index + j].As<ir::For>()->extent.as_int32();
+            extent_prod *= extent;
+            if (first_idx_less_than_max_factor == -1 && extent <= max_factor_) {
+              first_idx_less_than_max_factor = fuse_index + j;
+            }
+          }
+          if (extent_prod <= max_factor_) {
+            std::vector<Expr> loops_to_fuse(for_exprs.begin() + fuse_index,
+                                            for_exprs.begin() + fuse_index + tile_loops_[i].size());
+            Expr fused = ir_schedule->Fuse(loops_to_fuse);
+            ir_schedule->Bind(fused, config_.bind_axis[i]);
+            fuse_index += 1;
+          } else if (first_idx_less_than_max_factor != -1) {
+            ir_schedule->Bind(for_exprs[first_idx_less_than_max_factor], config_.bind_axis[i]);
+            fuse_index += tile_loops_[i].size();
+          }
+        }
+      }
+    }
+  }
+}
+
+void MultiLevelTiling::ApplyCacheRead(ir::IRSchedule* ir_schedule, ir::Expr& block_expr) {
+  ir::ScheduleBlockRealize* sch_block_realize = block_expr.As<ir::ScheduleBlockRealize>();
+  ir::ScheduleBlock* sch_block = sch_block_realize->schedule_block.As<ir::ScheduleBlock>();
+  std::string block_name = sch_block->name;
+
+  // Analyze which buffers can be cached
+  std::vector<int> read_buffer_indexes;
+  for (int i = 0; i < sch_block->read_buffers.size(); ++i) {
+    bool is_read_write = false;
+    for (int j = 0; j < sch_block->write_buffers.size(); ++j) {
+      if (sch_block->read_buffers[i] == sch_block->write_buffers[j]) {
+        is_read_write = true;
+        break;
+      }
+    }
+    if (!is_read_write) {
+      read_buffer_indexes.push_back(i);
+    }
+  }
+
+  // Schedule
+  for (int read_buffer_index : read_buffer_indexes) {
+    for (int level : config_.read_cache_levels) {
+      // 1. Find the target loop
+      const auto loops = tile_loops_.at(level - 1);
+      if (loops.size() == 0) {
+        continue;
+      }
+
+      // 2. Do CacheRead and get the cache block
+      ir::Expr cache_block = ir_schedule->CacheRead(block_expr, read_buffer_index, config_.read_cache_memory_type);
+      std::string cache_block_name =
+          cache_block.As<ir::ScheduleBlockRealize>()->schedule_block.As<ir::ScheduleBlock>()->name;
+
+      std::string target_for_loop_name = loops.back().As<ir::For>()->loop_var->name;
+
+      // 3. Place the cache_block under the target_for_loop.
+      // The original block expr is invalid after the CacheRead schedule,
+      // so we reacquire the block expr after the schedule according to the block name.
+      block_expr = ir_schedule->GetBlock(block_name);
+      std::vector<Expr> for_exprs = ir_schedule->GetLoops(block_expr);
+      for (const Expr& for_expr : for_exprs) {
+        if (for_expr.As<ir::For>()->loop_var->name.find(target_for_loop_name) != std::string::npos) {
+          ir_schedule->ComputeAt(cache_block, for_expr, true);
+          break;
+        }
+      }
+
+      // 4. Threads under the same block cooperatively fetch data from global memory.
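+      // (Note: the ir::attr::cooperative_process annotation applied below is what
+      // marks the cache block for this cooperative fetch; later lowering is expected
+      // to distribute the copy across the threads already bound in this block. The
+      // constant 0 passed as the annotation value is a placeholder vector length,
+      // per the TODO below.)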
+      Expr new_cache_block = ir_schedule->GetBlock(cache_block_name);
+      auto cache_block_loops = ir_schedule->GetLoops(new_cache_block);
+      std::vector<std::string> compute_at_extra_var = utils::Split(
+          absl::get<std::string>(
+              new_cache_block.As<ir::ScheduleBlockRealize>()->schedule_block.As<ir::ScheduleBlock>()->attrs.at(
+                  "compute_at_extra_var")),
+          ",");
+      std::vector<Expr> buffer_loops;
+      // int nthreads = 1;
+      for (const Expr& for_expr : cache_block_loops) {
+        if (std::find(compute_at_extra_var.begin(),
+                      compute_at_extra_var.end(),
+                      for_expr.As<ir::For>()->loop_var->name) != compute_at_extra_var.end()) {
+          buffer_loops.push_back(for_expr);
+        }
+      }
+      auto fused_buffer_loop = ir_schedule->Fuse(buffer_loops);
+      // TODO(BiynXu): Implement vectorized data fetching and pass in the vector length
+      ir_schedule->Annotate(ir_schedule->GetBlock(cache_block_name), ir::attr::cooperative_process, 0);
+    }
+  }
+}
+
+void MultiLevelTiling::ApplyCacheWrite(ir::IRSchedule* ir_schedule, ir::Expr& block_expr) {
+  ir::Expr cache_block = ir_schedule->CacheWrite(block_expr, 0, config_.write_cache_memory_type);
+
+  for (int level : config_.write_cache_levels) {
+    const auto loops = tile_loops_.at(level - 1);
+    if (loops.size() == 0) {
+      continue;
+    }
+    std::string target_for_loop_name = loops.back().As<ir::For>()->loop_var->name;
+    // Because CacheWrite changes the block name, we compute the derived name
+    // according to the logic of CacheWrite and look up the loop structure by
+    // that derived name.
+    const std::string original_block_name =
+        block_expr.As<ir::ScheduleBlockRealize>()->schedule_block.As<ir::ScheduleBlock>()->name;
+    const std::string derivative_block_name =
+        original_block_name + "_" + config_.write_cache_memory_type + "_temp_buffer";
+    std::vector<Expr> for_exprs = ir_schedule->GetLoops(derivative_block_name);
+    for (const Expr& for_expr : for_exprs) {
+      if (for_expr.As<ir::For>()->loop_var->name.find(target_for_loop_name) != std::string::npos) {
+        ir_schedule->ReverseComputeAt(ir_schedule->GetBlock(original_block_name), for_expr, true);
+      }
+    }
+
+    const std::string reduce_init_block_name = original_block_name + "__reduce_init";
+    for_exprs = ir_schedule->GetLoops(derivative_block_name);
+    for (const Expr& for_expr : for_exprs) {
+      if (for_expr.As<ir::For>()->loop_var->name.find(target_for_loop_name) != std::string::npos &&
+          ir_schedule->HasBlock(reduce_init_block_name)) {
+        ir_schedule->SimpleComputeAt(ir_schedule->GetBlock(reduce_init_block_name), for_expr);
+      }
+    }
+  }
+}
+
+const std::unordered_map<common::Target::Arch, MultiLevelTiling::Config> MultiLevelTiling::kConfigs{
+    {common::Target::Arch::NVGPU,
+     MultiLevelTiling::Config{
+         /*bind_axis*/ std::vector<std::string>{"blockIdx.x", "threadIdx.x"},
+         /*tile_struct*/ std::string("SSSRRSRS"),
+         /*read_cache_memory_type*/ std::string("shared"),
+         /*read_cache_levels*/ std::vector<int>{4},
+         /*write_cache_memory_type*/ std::string("local"),
+         /*write_cache_levels*/ std::vector<int>{3},
+     }},
+    {common::Target::Arch::X86,
+     MultiLevelTiling::Config{
+         /*bind_axis*/ std::vector<std::string>{},
+         /*tile_struct*/ std::string("SSRSRS"),
+         /*read_cache_memory_type*/ std::string("local"),
+         /*read_cache_levels*/ std::vector<int>{3},
+         /*write_cache_memory_type*/ std::string("local"),
+         /*write_cache_levels*/ std::vector<int>{2},
+     }}};
+
+}  // namespace auto_schedule
+}  // namespace cinn
diff --git a/paddle/cinn/auto_schedule/search_space/auto_gen_rule/multi_level_tiling.h b/paddle/cinn/auto_schedule/search_space/auto_gen_rule/multi_level_tiling.h
new file mode 100644
index 0000000000000..0756071657dbd
--- /dev/null
+++ b/paddle/cinn/auto_schedule/search_space/auto_gen_rule/multi_level_tiling.h
@@ -0,0 +1,138 @@
+// Copyright (c) 2022 CINN Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <glog/logging.h>
+
+#include <cmath>
+#include <cstdlib>
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "cinn/auto_schedule/search_space/auto_gen_rule/auto_gen_rule.h"
+#include "cinn/common/target.h"
+#include "cinn/ir/ir.h"
+#include "cinn/ir/ir_base.h"
+#include "cinn/ir/ir_schedule.h"
+
+namespace cinn {
+namespace auto_schedule {
+
+class MultiLevelTiling : public AutoGenRule {
+ public:
+  struct Config {
+    // Which thread axis each tiled loop is bound to
+    std::vector<std::string> bind_axis;
+    // Use the chars 'S' and 'R' to represent the tile structure:
+    // S means a space tiling level and R means a reduce tiling level.
+    //
+    // For example, if tile_struct = "SSRSRS" and we are doing matrix
+    // multiplication, where i, j are the spatial indices and k is the reduce
+    // index, the tiling result will be i0, j0, i1, j1, k0, i2, j2, k1, i3, j3
+    std::string tile_struct;
+    // The storage type of the read cache
+    std::string read_cache_memory_type;
+    // The tiled levels at which read cache blocks are inserted
+    std::vector<int> read_cache_levels;
+    // The storage type of the write cache
+    std::string write_cache_memory_type;
+    // The tiled levels at which write cache blocks are inserted
+    std::vector<int> write_cache_levels;
+  };
+
+  static const std::unordered_map<common::Target::Arch, Config> kConfigs;
+
+  MultiLevelTiling(const common::Target& target, const Config& config);
+  ~MultiLevelTiling() = default;
+
+  // Initialize the AutoGenRule; it must be called before further actions.
+  // Returns the RuleApplyType indicating whether and how the rule can be
+  // applied on the given schedule.
+  RuleApplyType Init(ir::IRSchedule* init_schedule) override;
+
+  // Applies the rule on the ir::ModuleExpr for a schedule block specified by
+  // an index between 0 (inclusive) and NumberApplicable() (exclusive)
+  void Apply(int index) override;
+
+  // Returns the name of the rule, used for debug.
+  std::string GetRuleName() const override;
+
+  // Returns true if sche_block_realize can be handled by MultiLevelTiling
+  bool MeetCondition(const ir::ScheduleBlockRealize& sche_block_realize) const;
+
+  RuleApplyType AnalyseApplyType(SearchState state, const std::string& block_name) const override;
+
+  std::vector<SearchState> ApplyOnBlock(SearchState state, const std::string& block_name) override;
+
+  // Sample a pair of integers (a, b) such that a * b == extent
+  template <typename T>
+  std::vector<T> SampleSplitTwo(T extent) const {
+    std::vector<std::vector<T>> candidates;
+    for (T div = 1; div <= sqrt(extent); ++div) {
+      if (extent % div == 0) {
+        candidates.push_back({T(div), extent / div});
+      }
+    }
+    if (candidates.size() == 0) {
+      return {1, T(extent)};
+    }
+    int index = rand() % candidates.size();
+    std::vector<T> pick = candidates[index];
+    if (rand() % 2 != 0) {
+      T tmp = pick[0];
+      pick[0] = pick[1];
+      pick[1] = tmp;
+    }
+    return pick;
+  }
+
+  // Sample num_split integers whose product equals extent
+  template <typename T>
+  std::vector<T> SampleTileSplit(T extent, int num_split) const {
+    CHECK_GT(num_split, 0) << "num_split in SampleTileSplit must be greater than 0";
+    if (num_split == 1) {
+      return {extent};
+    }
+    std::vector<T> two_split = SampleSplitTwo<T>(extent);
+    if (num_split == 2) {
+      return two_split;
+    }
+    int half = num_split >> 1;
+    std::vector<T> result = SampleTileSplit<T>(two_split[0], half);
+    std::vector<T> remind = SampleTileSplit<T>(two_split[1], num_split - half);
+    result.insert(result.end(), remind.begin(), remind.end());
+    return result;
+  }
+
+ private:
+  void ApplyTiling(ir::IRSchedule* ir_schedule, ir::Expr& block_expr);
+  void ApplyCacheRead(ir::IRSchedule* ir_schedule, ir::Expr& block_expr);
+  void ApplyCacheWrite(ir::IRSchedule* ir_schedule, ir::Expr& block_expr);
+
+ private:
+  std::vector<ir::Expr> all_block_realizes_;
+  std::vector<int> applicable_indices_;
+
+  Config config_;
+  std::vector<int> s_indices_;
+  std::vector<int> r_indices_;
+  std::vector<std::vector<ir::Expr>> tile_loops_;
+
+  // A factor to limit the split factors within the max thread number per block
+  int max_factor_ = 1024;
+};
+
+}  // namespace auto_schedule
+}  // namespace cinn
diff --git a/paddle/cinn/auto_schedule/search_space/auto_gen_rule/multi_level_tiling_test.cc b/paddle/cinn/auto_schedule/search_space/auto_gen_rule/multi_level_tiling_test.cc
new file mode 100644
index 0000000000000..91ddf361da4d3
--- /dev/null
+++ b/paddle/cinn/auto_schedule/search_space/auto_gen_rule/multi_level_tiling_test.cc
@@ -0,0 +1,548 @@
+// Copyright (c) 2022 CINN Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
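+
+// An illustrative trace of the sampling invariant exercised below (the factor
+// values are hypothetical): SampleTileSplit(32, 3) first calls SampleSplitTwo
+// to split 32 into two factors, say {2, 16}, then recursively splits the
+// parts, e.g. yielding {2, 2, 8}; the product of the result always equals
+// the input extent.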
+ +#include "cinn/auto_schedule/search_space/auto_gen_rule/multi_level_tiling.h" + +#include +#include + +#include +#include +#include + +#include "cinn/auto_schedule/search_space/auto_gen_rule/auto_gen_rule.h" +#include "cinn/auto_schedule/search_space/auto_gen_rule/test_helper.h" +#include "cinn/cinn.h" +#include "cinn/frontend/syntax.h" +#include "cinn/ir/ir.h" +#include "cinn/ir/ir_base.h" +#include "cinn/ir/ir_printer.h" +#include "cinn/ir/ir_schedule.h" +#include "cinn/ir/tensor.h" +#include "cinn/lang/compute.h" +#include "cinn/lang/lower.h" +#include "cinn/poly/stage.h" +#include "cinn/utils/string.h" +#include "tests/program_builder.h" + +namespace cinn { +namespace auto_schedule { + +TEST(MultiLevelTile, SampleSplitTwo) { + srand(0); + Context::Global().ResetNameId(); +#ifdef CINN_WITH_CUDA + Target target = common::DefaultNVGPUTarget(); +#else + Target target = common::DefaultHostTarget(); +#endif + + MultiLevelTiling multi_level_tiling(target, MultiLevelTiling::kConfigs.at(target.arch)); + + for (int i = 0; i < 100; ++i) { + size_t number_to_split = rand() % 65535 + 2; // random number in [2, 2^16] + std::vector split = multi_level_tiling.SampleSplitTwo(number_to_split); + EXPECT_EQ(split.size(), 2UL); + EXPECT_EQ(split[0] * split[1], number_to_split); + } +} + +TEST(MultiLevelTile, SampleTileSplit) { + srand(0); + Context::Global().ResetNameId(); +#ifdef CINN_WITH_CUDA + Target target = common::DefaultNVGPUTarget(); +#else + Target target = common::DefaultHostTarget(); +#endif + + MultiLevelTiling multi_level_tiling(target, MultiLevelTiling::kConfigs.at(target.arch)); + + for (int i = 0; i < 100; ++i) { + int number_to_split = rand() % 65535 + 2; // random number in [2, 2^16] + int split_size = rand() % 5 + 1; // random in [1, 5] + std::vector split = multi_level_tiling.SampleTileSplit(number_to_split, split_size); + EXPECT_EQ(split.size(), static_cast(split_size)); + int product = 1; + for (int num : split) { + product *= num; + } + EXPECT_EQ(product, number_to_split); + } +} + +TEST(MultiLevelTile, SimpleLoops) { + srand(0); + Context::Global().ResetNameId(); +#ifdef CINN_WITH_CUDA + Target target = common::DefaultNVGPUTarget(); +#else + Target target = common::DefaultHostTarget(); +#endif + + Expr M(32); + Expr N(128); + + Placeholder A("A", {M}); + Placeholder B("B", {N}); + + ir::Tensor C = Compute( + {M, N}, [&](Var i, Var j) { return A(i) + B(j); }, "C"); + + poly::StageMap stages = CreateStages({C}); + std::vector funcs = + lang::LowerVec("TestMultiLevelTile_SimpleLoops", stages, {C}, {}, {}, nullptr, target, true); + + ir::Expr ast_expr = funcs[0]->body; + VLOG(6) << "Expr before MultiLevelTiling: "; + VLOG(6) << ast_expr; + + MultiLevelTiling multi_level_tiling(target, MultiLevelTiling::kConfigs.at(target.arch)); + ir::IRSchedule ir_schedule(ir::ModuleExpr({ast_expr})); + SearchState state(ir_schedule, 0, {}); + EXPECT_EQ(multi_level_tiling.Init(&ir_schedule), RuleApplyType::kApplyAndPruneOtherRules); + EXPECT_EQ(multi_level_tiling.NumberApplicable(), 1); + multi_level_tiling.ApplyRandomly(); + + // ApplyOnBlock + EXPECT_EQ(multi_level_tiling.AnalyseApplyType(state, "C"), RuleApplyType::kApplyAndPruneOtherRules); + auto new_states = multi_level_tiling.ApplyOnBlock(state, "C"); + + auto test_func = [](ir::IRSchedule* ir_sch) { + std::vector exprs = ir_sch->GetModule().GetExprs(); + EXPECT_EQ(exprs.size(), 1UL); + std::stringstream ss; + ss << exprs[0]; + std::string expr_str = ss.str(); + VLOG(6) << expr_str; + }; + + test_func(&ir_schedule); + 
test_func(&new_states[0]->ir_schedule); +} + +// TODO: fix in future +/* +TEST(MulitLevelTile, MatrixMultiply) { + srand(0); + Context::Global().ResetNameId(); +#ifdef CINN_WITH_CUDA + Target target = common::DefaultNVGPUTarget(); +#else + Target target = common::DefaultHostTarget(); +#endif + + Expr M(32); + Expr N(32); + Expr K(32); + + Placeholder A("A", {M, K}); + Placeholder B("B", {K, N}); + + Var k(K.as_int32(), "reduce_axis_k"); + ir::Tensor C = Compute( + {M, N}, [&](Var i, Var j) { return ReduceSum(A(i, k) * B(k, j), {k}); }, "C"); + + poly::StageMap stages = CreateStages({C}); + std::vector funcs = + lang::LowerVec("TestMultiLevelTile_MatrixMultiply", stages, {C}, {}, {}, nullptr, target, true); + + ir::Expr ast_expr = funcs[0]->body; + VLOG(6) << "Expr before MultiLevelTiling: "; + VLOG(6) << ast_expr; + + MultiLevelTiling multi_level_tiling(target, MultiLevelTiling::kConfigs.at(target.arch)); + ir::IRSchedule ir_schedule(ir::ModuleExpr({ast_expr})); + SearchState state(ir_schedule, 0, {}); + EXPECT_EQ(multi_level_tiling.Init(&ir_schedule), RuleApplyType::kApplyAndPruneOtherRules); + EXPECT_EQ(multi_level_tiling.NumberApplicable(), 1); + multi_level_tiling.ApplyRandomly(); + + // ApplyOnBlock + EXPECT_EQ(multi_level_tiling.AnalyseApplyType(state, "C"), RuleApplyType::kApplyAndPruneOtherRules); + auto new_states = multi_level_tiling.ApplyOnBlock(state, "C"); + + auto test_func = [](ir::IRSchedule* ir_sch) { + std::vector exprs = ir_sch->GetModule().GetExprs(); + EXPECT_EQ(exprs.size(), 1UL); + std::stringstream ss; + ss << exprs[0]; + std::string expr_str = ss.str(); + VLOG(6) << expr_str; + }; + + test_func(&ir_schedule); + test_func(&new_states[0]->ir_schedule); +} +*/ +class TestMultiLevelTiling : public TestAutoGenRuleBase { + public: + int fixed_rand_seed = 1; + std::vector default_input_names; + std::vector default_output_names; +}; + +TEST_F(TestMultiLevelTiling, Matmul) { + default_input_names = {"X", "Y"}; + default_output_names = {"temp_matmul_out"}; + std::vector X_shape = {32, 32}; + std::vector Y_shape = {32, 32}; + std::vector out_shape = {32, 32}; + + Initialize(common::DefaultNVGPUTarget()); + frontend::Program matmul_op = tests::OpBuilder("matmul").Build({{"X", X_shape}, {"Y", Y_shape}}); + ir::IRSchedule ir_schedule = MakeIRSchedule(matmul_op, fixed_rand_seed); + SearchState state(ir_schedule); + VLOG(6) << "Original state:\n" << state->DebugString(); + + // Apply MultiLevelTiling + MultiLevelTiling multi_level_tiling(target_, MultiLevelTiling::kConfigs.at(target_.arch)); + EXPECT_EQ(multi_level_tiling.AnalyseApplyType(state, default_output_names[0]), + RuleApplyType::kApplyAndPruneOtherRules); + auto new_states = multi_level_tiling.ApplyOnBlock(state, default_output_names[0]); + VLOG(6) << "After MultiLevelTiling, state:\n" << new_states[0]->DebugString(); + std::string ir = GetIR(new_states[0]->ir_schedule); + std::string expected_ir = R"ROC(Expr 0 { +{ + ScheduleBlock(root) + { + { + thread_bind[blockIdx.x] for (i_j_fused, 0, 4) + { + thread_bind[threadIdx.x] for (i_0_j_0_fused, 0, 1) + { + serial for (i_1, 0, 1) + { + serial for (j_1, 0, 1) + { + serial for (i_2, 0, 1) + { + serial for (j_2, 0, 1) + { + serial for (i_3, 0, 8) + { + serial for (j_3, 0, 32) + { + ScheduleBlock(temp_matmul_out__reduce_init) + { + i0, i1 = axis.bind(((8 * i_0_j_0_fused) + ((8 * i_1) + ((8 * i_2) + ((8 * i_j_fused) + i_3)))), ((32 * j_1) + ((32 * j_2) + j_3))) + { + temp_matmul_out__reduce_init[((8 * i_0_j_0_fused) + ((8 * i_1) + ((8 * i_2) + ((8 * i_j_fused) + i_3)))), ((32 * 
j_1) + ((32 * j_2) + j_3))] = 0.00000000f + } + } + } + } + } + } + { + serial for (reduce_k_0, 0, 4) + { + serial for (ax0_0_ax1_0_fused, 0, 256) + { + ScheduleBlock(Y_reshape_shared_temp_buffer) + { + v0, v1 = axis.bind(((ax0_0_ax1_0_fused / 32) + (8 * reduce_k_0)), ((ax0_0_ax1_0_fused % 32) + (32 * j_1))) + attrs(compute_at_extra_var:ax0_0,ax1_0, cooperative_process:0) + { + Y_reshape_shared_temp_buffer[v0, v1] = Y_reshape[v0, v1] + } + } + } + serial for (ax0_ax1_fused, 0, 64) + { + ScheduleBlock(X_reshape_shared_temp_buffer) + { + v0, v1 = axis.bind(((ax0_ax1_fused / 8) + ((8 * i_0_j_0_fused) + ((8 * i_1) + (8 * i_j_fused)))), ((ax0_ax1_fused % 8) + (8 * reduce_k_0))) + attrs(compute_at_extra_var:ax0,ax1, cooperative_process:0) + { + X_reshape_shared_temp_buffer[v0, v1] = X_reshape[v0, v1] + } + } + } + serial for (reduce_k_1, 0, 1) + { + serial for (i_2, 0, 1) + { + serial for (j_2, 0, 1) + { + serial for (reduce_k_2, 0, 8) + { + serial for (i_3, 0, 8) + { + serial for (j_3, 0, 32) + { + ScheduleBlock(temp_matmul_out_local_temp_buffer) + { + i0_0, i1_0, i2 = axis.bind(((8 * i_0_j_0_fused) + ((8 * i_1) + ((8 * i_2) + ((8 * i_j_fused) + i_3)))), ((32 * j_1) + ((32 * j_2) + j_3)), ((8 * reduce_k_0) + ((8 * reduce_k_1) + reduce_k_2))) + read_buffers(_temp_matmul_out[i(undefined:undefined), j(undefined:undefined)], _X[i(undefined:undefined), reduce_k(undefined:undefined)], _Y[reduce_k(undefined:undefined), j(undefined:undefined)]) + write_buffers(_temp_matmul_out[i(undefined:undefined), j(undefined:undefined)]) + { + temp_matmul_out_local_temp_buffer[((8 * i_0_j_0_fused) + ((8 * i_1) + ((8 * i_2) + ((8 * i_j_fused) + i_3)))), ((32 * j_1) + ((32 * j_2) + j_3))] = (temp_matmul_out_local_temp_buffer[((8 * i_0_j_0_fused) + ((8 * i_1) + ((8 * i_2) + ((8 * i_j_fused) + i_3)))), ((32 * j_1) + ((32 * j_2) + j_3))] + (X_reshape_shared_temp_buffer[((8 * i_0_j_0_fused) + ((8 * i_1) + ((8 * i_2) + ((8 * i_j_fused) + i_3)))), ((8 * reduce_k_0) + ((8 * reduce_k_1) + reduce_k_2))] * Y_reshape_shared_temp_buffer[((8 * reduce_k_0) + ((8 * reduce_k_1) + reduce_k_2)), ((32 * j_1) + ((32 * j_2) + j_3))])) + } + } + } + } + } + } + } + } + } + serial for (ax0_1, 0, 8) + { + serial for (ax1_1, 0, 32) + { + ScheduleBlock(temp_matmul_out) + { + v0, v1 = axis.bind((((8 * i_0_j_0_fused) + ((8 * i_1) + (8 * i_j_fused))) + ax0_1), ((32 * j_1) + ax1_1)) + attrs(reverse_compute_at_extra_var:ax0_1,ax1_1) + { + temp_matmul_out[v0, v1] = temp_matmul_out_local_temp_buffer[v0, v1] + } + } + } + } + } + } + } + } + } + } + } +} +} // end Expr 0 +)ROC"; + ASSERT_EQ(ir, expected_ir); + + // build ir::Module and debug source code + auto ir_module = BuildIRModule(new_states[0]->ir_schedule); + auto source_code = GenSourceCode(ir_module); + VLOG(6) << "scheduled source code:\n" << source_code; + + // execute and check precision + CheckResult( + GenExecutableKernel(ir_module), + GenExecutableKernel(BuildIRModule(MakeIRSchedule(matmul_op, fixed_rand_seed, /* apply_manual_schedule*/ true))), + default_input_names, + default_output_names, + {X_shape, Y_shape}, + {out_shape}, + target_); +} + +TEST_F(TestMultiLevelTiling, ReduceSum) { + default_input_names = {"X"}; + default_output_names = {"var_0_tmp"}; + std::vector X_shape = {1, 16, 32}; + std::vector out_shape = {1, 16, 1}; + std::vector reduce_dim = {2}; + + Initialize(common::DefaultNVGPUTarget()); + frontend::Program reduce_sum_op = + tests::OpBuilder("reduce_sum").Build({{"X", X_shape}}, {{"dim", reduce_dim}, {"keep_dim", false}}); + ir::IRSchedule ir_schedule = 
MakeIRSchedule(reduce_sum_op); + SearchState state(ir_schedule); + VLOG(6) << "Original state:\n" << state->DebugString(); + + // Apply MultiLevelTiling + MultiLevelTiling multi_level_tiling(target_, MultiLevelTiling::kConfigs.at(target_.arch)); + // EXPECT_EQ(multi_level_tiling.AnalyseApplyType(state, default_output_names[0]), RuleApplyType::kCannotApply); +} + +TEST_F(TestMultiLevelTiling, Pool2d) { + default_input_names = {"input"}; + default_output_names = {"var_0"}; + std::vector input_shape{2, 8, 16, 16}; + std::vector output_shape{2, 8, 8, 8}; + std::string pooling_type = "max"; + std::vector ksize{3, 3}; + std::vector strides{2, 2}; + std::vector paddings{1, 1, 1, 1}; + bool ceil_mode = false; + bool exclusive = true; + bool global_pooling = false; + std::string data_format = "NCHW"; + bool adaptive = false; + std::string padding_algorithm = "EXPLICIT"; + frontend::Program pool2d_program = tests::OpBuilder("pool2d").Build({{"input", input_shape}}, + {{"pool_type", pooling_type}, + {"kernel_size", ksize}, + {"stride_size", strides}, + {"padding_size", paddings}, + {"ceil_mode", ceil_mode}, + {"exclusive", exclusive}, + {"global_pooling", global_pooling}, + {"data_format", data_format}, + {"adaptive", adaptive}, + {"padding_algorithm", padding_algorithm}}); + + Initialize(common::DefaultNVGPUTarget()); + ir::IRSchedule ir_schedule = MakeIRSchedule(pool2d_program, fixed_rand_seed); + SearchState state(ir_schedule); + VLOG(6) << "Original state:\n" << state->DebugString(); + + // Apply MultiLevelTiling + MultiLevelTiling::Config mlt_config = { + /*bind_axis*/ std::vector{"blockIdx.x", "threadIdx.x"}, + /*tile_struct*/ std::string("SSRS"), + /*read_cache_memory_type*/ std::string("shared"), + /*read_cache_levels*/ std::vector{3}, + /*write_cache_memory_type*/ std::string("local"), + /*write_cache_levels*/ std::vector{2}, + }; + MultiLevelTiling multi_level_tiling(target_, mlt_config); + EXPECT_EQ(multi_level_tiling.AnalyseApplyType(state, default_output_names[0]), + RuleApplyType::kApplyAndPruneOtherRules); + auto new_states = multi_level_tiling.ApplyOnBlock(state, default_output_names[0]); + VLOG(6) << "After MultiLevelTiling, state:\n" << new_states[0]->DebugString(); + + std::string ir = GetIR(new_states[0]->ir_schedule); + std::string expected_ir = R"ROC(Expr 0 { +{ + ScheduleBlock(root) + { + serial for (i, 0, 2) + { + serial for (j, 0, 8) + { + serial for (k, 0, 18) + { + serial for (a, 0, 18) + { + ScheduleBlock(pad_temp_0) + { + i0, i1, i2, i3 = axis.bind(i, j, k, a) + pad_temp_0[i, j, k, a] = select(((a < 17) and ((a >= 1) and ((k < 17) and (k >= 1)))), input[i, j, (-1 + k), (-1 + a)], -3.40282347e+38f) + } + } + } + } + } + } +} +} // end Expr 0 +Expr 1 { +{ + ScheduleBlock(root_0) + { + { + thread_bind[blockIdx.x] for (i_j_k_a_fused, 0, 16) + { + thread_bind[threadIdx.x] for (i_0_j_0_k_0_a_0_fused, 0, 4) + { + serial for (i_1, 0, 1) + { + serial for (j_1, 0, 4) + { + serial for (k_1, 0, 1) + { + serial for (a_1, 0, 4) + { + ScheduleBlock(var_0__reduce_init) + { + i0_0, i1_0, i2_0, i3_0 = axis.bind(((((i_j_k_a_fused / 2) / 2) / 2) + ((i_0_j_0_k_0_a_0_fused / 4) + i_1)), ((4 * (((i_j_k_a_fused / 2) / 2) % 2)) + j_1), ((i_0_j_0_k_0_a_0_fused % 4) + ((4 * ((i_j_k_a_fused / 2) % 2)) + k_1)), ((4 * (i_j_k_a_fused % 2)) + a_1)) + { + var_0__reduce_init[((((i_j_k_a_fused / 2) / 2) / 2) + ((i_0_j_0_k_0_a_0_fused / 4) + i_1)), ((4 * (((i_j_k_a_fused / 2) / 2) % 2)) + j_1), ((4 * ((i_j_k_a_fused / 2) % 2)) + ((i_0_j_0_k_0_a_0_fused % 4) + k_1)), ((4 * (i_j_k_a_fused % 2)) + a_1)] = 
-3.40282347e+38f + } + } + } + } + } + } + { + serial for (kernel_idx, 0, 3) + { + serial for (kernel_idx_0, 0, 3) + { + serial for (ax0_ax1_ax2_ax3_fused, 0, 28) + { + ScheduleBlock(pad_temp_0_shared_temp_buffer) + { + v0, v1, v2, v3 = axis.bind(((((i_j_k_a_fused / 2) / 2) / 2) + ((i_0_j_0_k_0_a_0_fused / 4) + ((ax0_ax1_ax2_ax3_fused / 7) / 4))), (((ax0_ax1_ax2_ax3_fused / 7) % 4) + (4 * (((i_j_k_a_fused / 2) / 2) % 2))), ((8 * ((i_j_k_a_fused / 2) % 2)) + ((2 * (i_0_j_0_k_0_a_0_fused % 4)) + kernel_idx)), ((ax0_ax1_ax2_ax3_fused % 7) + ((8 * (i_j_k_a_fused % 2)) + kernel_idx_0))) + attrs(compute_at_extra_var:ax0,ax1,ax2,ax3, cooperative_process:0) + { + pad_temp_0_shared_temp_buffer[v0, v1, v2, v3] = pad_temp_0[v0, v1, v2, v3] + } + } + } + serial for (i_1, 0, 1) + { + serial for (j_1, 0, 4) + { + serial for (k_1, 0, 1) + { + serial for (a_1, 0, 4) + { + ScheduleBlock(var_0_local_temp_buffer) + { + i0_1, i1_1, i2_1, i3_1, i4, i5 = axis.bind(((((i_j_k_a_fused / 2) / 2) / 2) + ((i_0_j_0_k_0_a_0_fused / 4) + i_1)), ((4 * (((i_j_k_a_fused / 2) / 2) % 2)) + j_1), ((i_0_j_0_k_0_a_0_fused % 4) + ((4 * ((i_j_k_a_fused / 2) % 2)) + k_1)), ((4 * (i_j_k_a_fused % 2)) + a_1), kernel_idx, kernel_idx_0) + read_buffers(_var_0[i(undefined:undefined), j(undefined:undefined), k(undefined:undefined), a(undefined:undefined)], _pad_temp_0[i(undefined:undefined), j(undefined:undefined)]) + write_buffers(_var_0[i(undefined:undefined), j(undefined:undefined), k(undefined:undefined), a(undefined:undefined)]) + { + var_0_local_temp_buffer[((((i_j_k_a_fused / 2) / 2) / 2) + ((i_0_j_0_k_0_a_0_fused / 4) + i_1)), ((4 * (((i_j_k_a_fused / 2) / 2) % 2)) + j_1), ((4 * ((i_j_k_a_fused / 2) % 2)) + ((i_0_j_0_k_0_a_0_fused % 4) + k_1)), ((4 * (i_j_k_a_fused % 2)) + a_1)] = cinn_max(var_0_local_temp_buffer[((((i_j_k_a_fused / 2) / 2) / 2) + ((i_0_j_0_k_0_a_0_fused / 4) + i_1)), ((4 * (((i_j_k_a_fused / 2) / 2) % 2)) + j_1), ((i_0_j_0_k_0_a_0_fused % 4) + ((4 * ((i_j_k_a_fused / 2) % 2)) + k_1)), ((4 * (i_j_k_a_fused % 2)) + a_1)], pad_temp_0_shared_temp_buffer[((((i_j_k_a_fused / 2) / 2) / 2) + ((i_0_j_0_k_0_a_0_fused / 4) + i_1)), ((4 * (((i_j_k_a_fused / 2) / 2) % 2)) + j_1), ((8 * ((i_j_k_a_fused / 2) % 2)) + ((2 * (i_0_j_0_k_0_a_0_fused % 4)) + ((2 * k_1) + kernel_idx))), ((8 * (i_j_k_a_fused % 2)) + ((2 * a_1) + kernel_idx_0))]) + } + } + } + } + } + } + } + } + serial for (ax0_0, 0, 1) + { + serial for (ax1_0, 0, 4) + { + serial for (ax2_0, 0, 1) + { + serial for (ax3_0, 0, 4) + { + ScheduleBlock(var_0) + { + v0, v1, v2, v3 = axis.bind((((((i_j_k_a_fused / 2) / 2) / 2) + (i_0_j_0_k_0_a_0_fused / 4)) + ax0_0), ((4 * (((i_j_k_a_fused / 2) / 2) % 2)) + ax1_0), (((4 * ((i_j_k_a_fused / 2) % 2)) + (i_0_j_0_k_0_a_0_fused % 4)) + ax2_0), ((4 * (i_j_k_a_fused % 2)) + ax3_0)) + attrs(reverse_compute_at_extra_var:ax0_0,ax1_0,ax2_0,ax3_0) + { + var_0[v0, v1, v2, v3] = var_0_local_temp_buffer[v0, v1, v2, v3] + } + } + } + } + } + } + } + } + } + } + } +} +} // end Expr 1 +)ROC"; + ASSERT_EQ(ir, expected_ir); + + // build ir::Module and debug source code + auto ir_module = BuildIRModule(new_states[0]->ir_schedule); + auto source_code = GenSourceCode(ir_module); + VLOG(6) << "scheduled source code:\n" << source_code; + + // execute and check precision + CheckResult(GenExecutableKernel(ir_module), + GenExecutableKernel( + BuildIRModule(MakeIRSchedule(pool2d_program, fixed_rand_seed, /* apply_manual_schedule*/ true))), + default_input_names, + default_output_names, + {input_shape}, + {output_shape}, + target_); +} + +} // namespace 
auto_schedule +} // namespace cinn diff --git a/paddle/cinn/auto_schedule/search_space/auto_gen_rule/skip_rule.cc b/paddle/cinn/auto_schedule/search_space/auto_gen_rule/skip_rule.cc new file mode 100644 index 0000000000000..795a1bdc488fb --- /dev/null +++ b/paddle/cinn/auto_schedule/search_space/auto_gen_rule/skip_rule.cc @@ -0,0 +1,38 @@ +// Copyright (c) 2022 CINN Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "cinn/auto_schedule/search_space/auto_gen_rule/skip_rule.h" + +#include + +#include "cinn/auto_schedule/search_space/auto_gen_rule/auto_gen_rule.h" +#include "cinn/common/target.h" +#include "cinn/ir/ir_schedule.h" +#include "cinn/optim/ir_copy.h" + +namespace cinn { +namespace auto_schedule { + +SkipRule::SkipRule(const common::Target& target) : AutoGenRule(target) {} + +RuleApplyType SkipRule::Init(ir::IRSchedule* ir_schedule) { + ir_schedule_ = ir_schedule; + num_applicable_ = 1; + return RuleApplyType::kApply; +} + +std::string SkipRule::GetRuleName() const { return "SkipRule"; } + +} // namespace auto_schedule +} // namespace cinn diff --git a/paddle/cinn/auto_schedule/search_space/auto_gen_rule/skip_rule.h b/paddle/cinn/auto_schedule/search_space/auto_gen_rule/skip_rule.h new file mode 100644 index 0000000000000..0b7f26f2fdd8b --- /dev/null +++ b/paddle/cinn/auto_schedule/search_space/auto_gen_rule/skip_rule.h @@ -0,0 +1,45 @@ +// Copyright (c) 2022 CINN Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
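+
+// SkipRule is effectively a no-op rule: Init always reports one applicable
+// schedule, Apply changes nothing, and ApplyOnBlock returns the input state
+// unchanged, so the search may explicitly choose to leave a block as-is.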
+ +#pragma once + +#include + +#include "cinn/auto_schedule/search_space/auto_gen_rule/auto_gen_rule.h" +#include "cinn/common/target.h" +#include "cinn/ir/ir_schedule.h" + +namespace cinn { +namespace auto_schedule { + +class SkipRule : public AutoGenRule { + public: + SkipRule(const common::Target& target); + ~SkipRule() = default; + + RuleApplyType Init(ir::IRSchedule* init_schedule) override; + + void Apply(int index) override {} + + std::string GetRuleName() const override; + + RuleApplyType AnalyseApplyType(SearchState state, const std::string& block_name) const override { + return RuleApplyType::kApply; + } + + std::vector ApplyOnBlock(SearchState state, const std::string& block_name) override { return {state}; } +}; + +} // namespace auto_schedule +} // namespace cinn diff --git a/paddle/cinn/auto_schedule/search_space/auto_gen_rule/skip_rule_test.cc b/paddle/cinn/auto_schedule/search_space/auto_gen_rule/skip_rule_test.cc new file mode 100644 index 0000000000000..9031605a7508c --- /dev/null +++ b/paddle/cinn/auto_schedule/search_space/auto_gen_rule/skip_rule_test.cc @@ -0,0 +1,122 @@ +// Copyright (c) 2022 CINN Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "cinn/auto_schedule/search_space/auto_gen_rule/skip_rule.h" + +#include +#include + +#include +#include +#include + +#include "cinn/auto_schedule/search_space/auto_gen_rule/auto_gen_rule.h" +#include "cinn/cinn.h" +#include "cinn/ir/ir.h" +#include "cinn/ir/ir_base.h" +#include "cinn/ir/ir_schedule.h" +#include "cinn/ir/tensor.h" +#include "cinn/lang/compute.h" +#include "cinn/lang/lower.h" +#include "cinn/poly/stage.h" + +namespace cinn { +namespace auto_schedule { + +TEST(SkipRule, Basic) { + srand(0); + Context::Global().ResetNameId(); +#ifdef CINN_WITH_CUDA + Target target = common::DefaultNVGPUTarget(); +#else + Target target = common::DefaultHostTarget(); +#endif + + Expr M(32); + Expr N(128); + + Placeholder A("A", {M}); + Placeholder B("B", {N}); + + ir::Tensor C = Compute( + {M, N}, [&](Var i, Var j) { return A(i) + B(j); }, "C"); + + poly::StageMap stages = CreateStages({C}); + std::vector funcs = lang::LowerVec("TestSkipRule_Basic", stages, {C}, {}, {}, nullptr, target, true); + + ir::Expr ast_expr = funcs[0]->body; + VLOG(6) << "Expr before SkipRule: "; + VLOG(6) << ast_expr; + + SkipRule skip_rule(target); + ir::IRSchedule ir_schedule(ir::ModuleExpr({ast_expr})); + SearchState state(ir_schedule, 0, {}); + + EXPECT_EQ(skip_rule.Init(&ir_schedule), RuleApplyType::kApply); + EXPECT_EQ(skip_rule.NumberApplicable(), 1); + skip_rule.ApplyRandomly(); + + // ApplyOnBlock + EXPECT_EQ(skip_rule.AnalyseApplyType(state, "C"), RuleApplyType::kApply); + std::vector states = skip_rule.ApplyOnBlock(state, "C"); + + auto test_func = [&ast_expr](ir::IRSchedule* ir_sch) { + std::vector exprs = ir_sch->GetModule().GetExprs(); + EXPECT_EQ(exprs.size(), 1UL); + EXPECT_EQ(ast_expr, exprs[0]); + }; + + test_func(&ir_schedule); + test_func(&states[0]->ir_schedule); +} + +TEST(SkipRule, 
ApplyOnSpecificBlock) { + srand(0); + Context::Global().ResetNameId(); +#ifdef CINN_WITH_CUDA + Target target = common::DefaultNVGPUTarget(); +#else + Target target = common::DefaultHostTarget(); +#endif + + Expr M(32); + Expr N(128); + + Placeholder A("A", {M}); + Placeholder B("B", {N}); + + ir::Tensor C = Compute( + {M, N}, [&](Var i, Var j) { return A(i) + B(j); }, "C"); + + poly::StageMap stages = CreateStages({C}); + std::vector funcs = lang::LowerVec("TestSkipRule_Basic", stages, {C}, {}, {}, nullptr, target, true); + + ir::Expr ast_expr = funcs[0]->body; + VLOG(6) << "Expr before SkipRule: "; + VLOG(6) << ast_expr; + + SkipRule skip_rule(target); + ir::IRSchedule ir_schedule(ir::ModuleExpr({ast_expr})); + SearchState state(ir_schedule, 0, {}); + + EXPECT_EQ(skip_rule.AnalyseApplyType(state, "C"), RuleApplyType::kApply); + std::vector states = skip_rule.ApplyOnBlock(state, "C"); + + std::vector exprs = states[0]->ir_schedule.GetModule().GetExprs(); + EXPECT_EQ(exprs.size(), 1UL); + EXPECT_EQ(ast_expr, exprs[0]); +} + +} // namespace auto_schedule +} // namespace cinn diff --git a/paddle/cinn/auto_schedule/search_space/auto_gen_rule/test_helper.cc b/paddle/cinn/auto_schedule/search_space/auto_gen_rule/test_helper.cc new file mode 100644 index 0000000000000..9ad001a23bdcc --- /dev/null +++ b/paddle/cinn/auto_schedule/search_space/auto_gen_rule/test_helper.cc @@ -0,0 +1,240 @@ +// Copyright (c) 2022 CINN Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
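+
+// The helpers in this file drive a full mini-pipeline for AutoGenRule tests:
+// lower a frontend::Program to an ir::IRSchedule, rebuild an ir::Module from
+// the (possibly rule-modified) schedule, generate and JIT-compile source
+// code, and finally compare the kernel's outputs against the manually
+// scheduled version via CheckResult.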
+ +#include "cinn/auto_schedule/search_space/auto_gen_rule/test_helper.h" + +#include +#include +#include +#include + +#include "cinn/auto_schedule/analysis/analyze_ir.h" +#include "cinn/backends/codegen_cuda_dev.h" +#include "cinn/cinn.h" +#include "cinn/frontend/optimize.h" +#include "cinn/hlir/framework/instruction.h" +#include "cinn/hlir/framework/op.h" +#include "cinn/hlir/framework/op_lowering.h" +#include "cinn/hlir/framework/pass.h" +#include "cinn/hlir/framework/tensor.h" +#include "cinn/optim/transform_gpu_forloop.h" +#ifdef CINN_WITH_CUDA +#include +#endif + +namespace cinn { +namespace auto_schedule { + +using ::cinn::hlir::framework::Instruction; +using ::cinn::hlir::framework::Scope; +using ::cinn::hlir::framework::Shape; +using ::cinn::hlir::framework::Tensor; + +void TestAutoGenRuleBase::Initialize(const common::Target& target) { + target_ = target; + backend_compier_ = backends::Compiler::Create(target); +} + +ir::IRSchedule TestAutoGenRuleBase::MakeIRSchedule(const frontend::Program& test_program, + utils::LinearRandomEngine::StateType rand_seed, + bool apply_manual_schedule) { + Context::Global().ResetNameId(); + + auto graph = std::make_shared(test_program, target_); + hlir::framework::ApplyPass(graph.get(), "OpFusionPass"); + LOG_IF(WARNING, graph->fusion_groups.size() > 1) << "Test Graph has more than 1 group"; + auto& dtype_dict = graph->GetMutableAttrs>("inferdtype"); + auto& shape_dict = graph->GetMutableAttrs>("infershape"); + hlir::framework::OpLowerer op_lowerer(dtype_dict, shape_dict, target_); + + if (apply_manual_schedule) { + lowered_funcs_ = op_lowerer.Lower(graph->fusion_groups.front()); + } else { + lowered_funcs_ = op_lowerer.LowerWithoutSchedule(graph->fusion_groups.front()); + } + CHECK(!lowered_funcs_.empty()) << "lowered_funcs_ is empty"; + + std::vector bodys; + for (auto&& func : lowered_funcs_) { + bodys.emplace_back(func->body); + } + return ir::IRSchedule(ir::ModuleExpr({std::move(bodys)}), rand_seed); +} + +std::string TestAutoGenRuleBase::GetIR(const ir::IRSchedule& schedule) { + const auto& exprs = schedule.GetModule().GetExprs(); + std::stringstream module_stream; + for (auto i = 0; i < exprs.size(); ++i) { + module_stream << "Expr " << i << " {\n" << exprs.at(i) << "\n} // end Expr " << i << "\n"; + } + return module_stream.str(); +} + +ir::Module TestAutoGenRuleBase::BuildIRModule(const ir::IRSchedule& schedule) { + auto&& updated_bodys = schedule.GetModule().GetExprs(); + CHECK_EQ(lowered_funcs_.size(), updated_bodys.size()) << "associated exprs size not equal"; + + ir::Module::Builder builder("test_bulder", this->target_); + for (int i = 0; i < lowered_funcs_.size(); ++i) { + ir::Expr func_body = updated_bodys.at(i); + const ir::LoweredFunc& ori_func = lowered_funcs_.at(i); + auto&& new_func = UpdateFuncWithNewBody(target_, ori_func, func_body); + builder.AddFunction(new_func); + } + + return builder.Build(); +} + +std::string TestAutoGenRuleBase::GenSourceCode(const ir::Module& ir_module) { + std::unique_ptr codegen; +#ifdef CINN_WITH_CUDA + if (target_ == common::DefaultNVGPUTarget()) { + codegen = std::make_unique(this->target_); + } else { + codegen = std::make_unique(this->target_, CodeGenCX86::Feature::AVX512); + } +#else + codegen = std::make_unique(this->target_, CodeGenCX86::Feature::AVX512); +#endif + codegen->SetInlineBuiltinCodes(false); + return codegen->Compile(ir_module, CodeGenC::OutputKind::CImpl); +} + +raw_func_type TestAutoGenRuleBase::GenExecutableKernel(const ir::Module& ir_module) { + auto&& func_name = 
lowered_funcs_.front()->name; + // Compile to machine code + backend_compier_->Build(ir_module); + auto test_func_ptr = reinterpret_cast(backend_compier_->Lookup(func_name)); + return test_func_ptr; +} + +void MemoryCopy(const float* src, float* dst, int numel, std::string type) { +#ifdef CINN_WITH_CUDA + if (type == "DeviceToHost") { + cudaMemcpy(dst, src, numel * sizeof(float), cudaMemcpyDeviceToHost); + return; + } else if (type == "HostToDevice") { + cudaMemcpy(dst, src, numel * sizeof(float), cudaMemcpyHostToDevice); + return; + } +#endif + if (type == "HostToHost") { + for (size_t i = 0; i < numel; ++i) { + dst[i] = src[i]; + } + } else { + LOG(FATAL) << "Unknown memory copy type"; + } +} + +void AddDataToScope( + Scope* scope, const common::Target& target, float* data_ptr, std::string name, const std::vector& shape) { + auto* var = scope->Var(name); + auto& tensor = absl::get(*var); + CHECK(shape.size()) << "The size of shape can not be 0."; + Shape cinn_shape(shape); + tensor->Resize(cinn_shape); + auto* tgt_data_ptr = tensor->mutable_data(target); + std::string mem_cpy_type = target == common::DefaultNVGPUTarget() ? "DeviceToHost" : "HostToHost"; + MemoryCopy(data_ptr, tgt_data_ptr, cinn_shape.numel(), mem_cpy_type); +} + +void CheckResult(raw_func_type test_func, + raw_func_type expected_func, + const std::vector& input_names, + const std::vector& output_names, + const std::vector>& input_shapes, + const std::vector>& output_shapes, + const common::Target& target) { + CHECK(input_names.size()) << "The number of inputs must be greater than 0."; + CHECK(output_names.size()) << "The number of outputs must be greater than 0."; + CHECK_EQ(input_names.size(), input_shapes.size()) << "The quantity of input_names and input_shapes must be equal."; + CHECK_EQ(output_names.size(), output_shapes.size()) + << "The quantity of output_names and output_shapes must be equal."; + + // Initialize data + std::vector input_data_ptrs(input_names.size()); + for (int i = 0; i < input_shapes.size(); ++i) { + int input_data_numel = + std::accumulate(input_shapes[i].begin(), input_shapes[i].end(), 1, [](int a, int b) { return a * b; }); + input_data_ptrs[i] = reinterpret_cast(malloc(input_data_numel * sizeof(float))); + for (int j = 0; j < input_data_numel; ++j) { + input_data_ptrs[i][j] = (rand() * 1.f) / RAND_MAX; + } + } + std::vector test_output_data_ptrs(output_names.size()); + std::vector expected_output_data_ptrs(output_names.size()); + std::vector output_data_numels(output_shapes.size()); + for (int i = 0; i < output_shapes.size(); ++i) { + output_data_numels[i] = + std::accumulate(output_shapes[i].begin(), output_shapes[i].end(), 1, [](int a, int b) { return a * b; }); + test_output_data_ptrs[i] = reinterpret_cast(malloc(output_data_numels[i] * sizeof(float))); + memset(test_output_data_ptrs[i], 0, output_data_numels[i] * sizeof(float)); + expected_output_data_ptrs[i] = reinterpret_cast(malloc(output_data_numels[i] * sizeof(float))); + memset(expected_output_data_ptrs[i], 0, output_data_numels[i] * sizeof(float)); + } + + auto launch_kernel_fn = [&](raw_func_type& raw_func, std::vector& output_data_ptrs) { + // Initialize scope + Scope scope; + // Initialize input data in scope. + for (int i = 0; i < input_names.size(); ++i) { + AddDataToScope(&scope, target, input_data_ptrs[i], input_names[i], input_shapes[i]); + } + // Initialize output data in scope. 
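+    // (they are pre-registered so the kernel has output buffers to write
+    // into; results are copied back out of the scope after instr.Run())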
+ for (int i = 0; i < output_names.size(); ++i) { + AddDataToScope(&scope, target, output_data_ptrs[i], output_names[i], output_shapes[i]); + } + + // Create Instruction and run + Instruction instr(target, &scope, input_names, output_names); + CHECK(raw_func) << "The raw_func can not be nullptr."; + instr.SetLoweredFunc(reinterpret_cast(raw_func)); + // should call Finalize explicitly before Run + instr.Finalize(); + instr.Run(); + + // data + for (int i = 0; i < output_names.size(); ++i) { + const float* result_ptr = scope.GetTensor(output_names[i])->data(); + std::string mem_cpy_type = target == common::DefaultNVGPUTarget() ? "DeviceToHost" : "HostToHost"; + MemoryCopy(result_ptr, output_data_ptrs[i], output_data_numels[i], mem_cpy_type); + } + }; + + // launch and execute test and expected kernel separately + launch_kernel_fn(test_func, test_output_data_ptrs); + launch_kernel_fn(expected_func, expected_output_data_ptrs); + + // Check result + for (int i = 0; i < output_shapes.size(); ++i) { + for (int j = 0; j < output_data_numels[i]; ++j) { + ASSERT_NEAR(test_output_data_ptrs[i][j], expected_output_data_ptrs[i][j], 1e-4); + } + } + + // Free memory + for (auto ptr : input_data_ptrs) { + free(ptr); + } + for (auto ptr : test_output_data_ptrs) { + free(ptr); + } + for (auto ptr : expected_output_data_ptrs) { + free(ptr); + } +} + +} // namespace auto_schedule +} // namespace cinn diff --git a/paddle/cinn/auto_schedule/search_space/auto_gen_rule/test_helper.h b/paddle/cinn/auto_schedule/search_space/auto_gen_rule/test_helper.h new file mode 100644 index 0000000000000..d8f8feb46babb --- /dev/null +++ b/paddle/cinn/auto_schedule/search_space/auto_gen_rule/test_helper.h @@ -0,0 +1,92 @@ +// Copyright (c) 2022 CINN Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include + +#include +#include + +#include "cinn/backends/compiler.h" +#include "cinn/common/target.h" +#include "cinn/frontend/syntax.h" +#include "cinn/hlir/framework/scope.h" +#include "cinn/ir/ir_schedule.h" +#include "cinn/ir/lowered_func.h" +#include "cinn/utils/random_engine.h" + +namespace cinn { +namespace auto_schedule { + +/* @brief: Function pointer of executable code compiled by CINN. + * @params-1: Pointers to all arguments, including input and output. + * @params-2: The number of Arguments. 
+ * @return: void + */ +using raw_func_type = void (*)(void**, int32_t); + +// A base utility class for testing AutoGenRule +class TestAutoGenRuleBase : public ::testing::Test { + public: + void SetUp() override { + srand(0); + Context::Global().ResetNameId(); + } + // Initialize context for specified target + void Initialize(const common::Target& target); + + // construct an ir::IRSchedule by lowering the specified for following AutoGenRule test + ir::IRSchedule MakeIRSchedule(const frontend::Program& test_program, + utils::LinearRandomEngine::StateType rand_seed = -1, + bool apply_manual_schedule = false); + + // Get the IR of bodies in IRSchedule + std::string GetIR(const ir::IRSchedule& schedule); + + // build ir::Module from the original lowered funcs with their bodies updated by the schedule + ir::Module BuildIRModule(const ir::IRSchedule& schedule); + + // generate source code with the built ir module + std::string GenSourceCode(const ir::Module& ir_module); + + // generate executable kernel function with the built ir module + raw_func_type GenExecutableKernel(const ir::Module& ir_module); + + protected: + common::Target target_; + std::vector lowered_funcs_; + std::unique_ptr backend_compier_; +}; + +/* @brief: Interface for checking function correctness. + * @params-1: Function pointer of the function to be tested. + * @params-2: Expected function pointer for comparison. + * @params-3: Names of input data. + * @params-4: Names of output data. + * @params-5: Shapes of the input data, each input corresponds to a std::vector. + * @params-6: Shapes of the output data, each output corresponds to a std::vector. + * @params-7: The Target expressing computing platform and architecture of the function to be tested. + * @return: void + */ +void CheckResult(raw_func_type test_func, + raw_func_type expected_func, + const std::vector& input_names, + const std::vector& output_names, + const std::vector>& input_shapes, + const std::vector>& output_shapes, + const common::Target& target); + +} // namespace auto_schedule +} // namespace cinn diff --git a/paddle/cinn/auto_schedule/search_space/block_sampler.cc b/paddle/cinn/auto_schedule/search_space/block_sampler.cc new file mode 100644 index 0000000000000..66cfb8d7bfba1 --- /dev/null +++ b/paddle/cinn/auto_schedule/search_space/block_sampler.cc @@ -0,0 +1,92 @@ +// Copyright (c) 2022 CINN Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "cinn/auto_schedule/search_space/block_sampler.h" + +#include + +#include "cinn/ir/ir.h" + +namespace cinn { +namespace auto_schedule { + +std::unique_ptr BlockSampler::Make(const std::vector& all_blocks, + bool default_remove_policy, + const std::string& strategy, + utils::LinearRandomEngine::StateType rand_seed, + const std::vector& weights) { + CHECK_GT(all_blocks.size(), 0) << "Empty block list"; + if (strategy == "traversal") { + VLOG(6) << "Init TraversalBlockSampler with block num = " << all_blocks.size(); + return std::make_unique(all_blocks, default_remove_policy); + } else if (strategy == "probabilistic") { + VLOG(6) << "Init ProbabilisticBlockSampler with block num = " << all_blocks.size(); + return std::make_unique(all_blocks, default_remove_policy, rand_seed, weights); + } + + LOG(FATAL) << "Unimplemented strategy:" << strategy; + return nullptr; +} + +BlockSampler::BlockSampler(const std::vector& all_blocks, bool default_remove_policy) { + default_remove_policy_ = default_remove_policy; + std::transform(all_blocks.begin(), all_blocks.end(), std::back_inserter(all_blocks_), [](const ir::Expr& block_expr) { + const ir::ScheduleBlockRealize* block_realize = block_expr.As(); + const ir::ScheduleBlock* block = block_realize->schedule_block.As(); + return block->name; + }); +} + +std::string TraversalBlockSampler::NextBlock(bool remove) { + if (cur_idx_ < all_blocks_.size()) { + VLOG(6) << "[TraversalBlockSampler] next block: " << all_blocks_.at(cur_idx_); + std::string block_name = all_blocks_.at(cur_idx_); + if (remove) { + ++cur_idx_; + } + return block_name; + } + + VLOG(6) << "[TraversalBlockSampler] next block: empty"; + return ""; +} + +ProbabilisticBlockSampler::ProbabilisticBlockSampler(const std::vector& all_blocks, + bool default_remove_policy, + utils::LinearRandomEngine::StateType rand_seed, + const std::vector& weights) + : BlockSampler(all_blocks, default_remove_policy), weights_(weights), rand_seed_(rand_seed) { + if (weights.empty()) { + weights_.resize(all_blocks.size(), 1); + } else { + CHECK_EQ(all_blocks.size(), weights_.size()); + } + remains_ = all_blocks.size(); +} + +std::string ProbabilisticBlockSampler::NextBlock(bool remove) { + if (remains_ == 0) { + return ""; + } + int block_idx = utils::SampleDiscreteFromDistribution(weights_, &rand_seed_); + if (remove) { + weights_[block_idx] = 0; + --remains_; + } + VLOG(6) << "[ProbabilisticBlockSampler] next block: " << all_blocks_.at(block_idx); + return all_blocks_.at(block_idx); +} + +} // namespace auto_schedule +} // namespace cinn \ No newline at end of file diff --git a/paddle/cinn/auto_schedule/search_space/block_sampler.h b/paddle/cinn/auto_schedule/search_space/block_sampler.h new file mode 100644 index 0000000000000..7135afffb0280 --- /dev/null +++ b/paddle/cinn/auto_schedule/search_space/block_sampler.h @@ -0,0 +1,115 @@ +// Copyright (c) 2022 CINN Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
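+
+// A minimal usage sketch (the `all_blocks` vector is assumed to come from
+// IRSchedule::GetAllBlocks()):
+//
+//   auto sampler = BlockSampler::Make(all_blocks, /*default_remove_policy=*/true,
+//                                     "traversal");
+//   for (std::string name = sampler->NextBlock(); !name.empty();
+//        name = sampler->NextBlock()) {
+//     // apply rules to the ScheduleBlock called `name`
+//   }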
+ +#pragma once + +#include +#include +#include + +#include "cinn/ir/ir_base.h" +#include "cinn/utils/random_engine.h" + +namespace cinn { +namespace auto_schedule { + +class SearchState; + +// Select the next block to be operated for SearchState during the search process +class BlockSampler { + public: + /** + * @brief Create a BlockSampler with the specific strategy name and necessary construct parameters. + * @param all_blocks All possible blocks to be sampled. + * @param default_remove_policy The default option to determine whether to delete the next block after selecting it. + * @param strategy The block sampling strategy. + * Currently, the available strategies are "traversal" and "probabilistic", + * where "traversal" means to select blocks one by one until all blocks are traversed, + * and "probabilistic" means randomly picking blocks according to the given distribution. + * @param weights Used for the probabilistic policy, giving each candidate a weight. + */ + static std::unique_ptr Make(const std::vector& all_blocks, + bool default_remove_policy = true, + const std::string& strategy = "traversal", + utils::LinearRandomEngine::StateType rand_seed = 0, + const std::vector& weights = {}); + + // Return the name of sample strategy + virtual const char* Name() const = 0; + + // Reset associated states to sample at the beginning + virtual void Reset() = 0; + + // Select a block with default remove policy. + std::string NextBlock() { return NextBlock(default_remove_policy_); } + + protected: + // A BlockSampler object should be created with the static function Make() + BlockSampler(const std::vector& all_blocks, bool default_remove_policy); + + // Select a block to apply rule + // The param remove is used to determine whether to delete the next block after selecting it, + // If remove == true, it will not be sampled in the future. + virtual std::string NextBlock(bool remove) = 0; + + // The names of all blocks + // Because the Block Expr will be changed in the search process, the name is saved for indexing + std::vector all_blocks_; + + // The default policy to determine whether to delete the next block after selecting it. + bool default_remove_policy_; +}; + +// Sample blocks with traversal strategy, +// witch means to select blocks one by one until all blocks are traversed. +class TraversalBlockSampler : public BlockSampler { + public: + TraversalBlockSampler(const std::vector& all_blocks, bool default_remove_policy) + : BlockSampler(all_blocks, default_remove_policy), cur_idx_(0) {} + + const char* Name() const override { return "traversal"; } + + void Reset() override { cur_idx_ = 0; } + + private: + std::string NextBlock(bool remove) override; + + private: + int cur_idx_; +}; + +// Sample blocks with probabilistic strategy, +// witch means randomly picking blocks according to the given distribution. 
+class ProbabilisticBlockSampler : public BlockSampler { + public: + ProbabilisticBlockSampler(const std::vector& all_blocks, + bool default_remove_policy, + utils::LinearRandomEngine::StateType rand_seed = 0, + const std::vector& weights = {}); + + const char* Name() const override { return "probabilistic"; } + + void Reset() override {} + + private: + std::string NextBlock(bool remove) override; + + private: + std::vector weights_; + utils::LinearRandomEngine::StateType rand_seed_; + int remains_; +}; + +} // namespace auto_schedule +} // namespace cinn diff --git a/paddle/cinn/auto_schedule/search_space/block_sampler_test.cc b/paddle/cinn/auto_schedule/search_space/block_sampler_test.cc new file mode 100644 index 0000000000000..ef07d964dd153 --- /dev/null +++ b/paddle/cinn/auto_schedule/search_space/block_sampler_test.cc @@ -0,0 +1,73 @@ +// Copyright (c) 2022 CINN Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "cinn/auto_schedule/search_space/block_sampler.h" + +#include + +#include "cinn/ir/ir.h" + +namespace cinn { +namespace auto_schedule { + +std::vector CreateTestBlocks() { + std::vector blocks; + for (int i = 0; i < 3; ++i) { + ir::Expr block = ir::ScheduleBlock::Make({}, {}, {}, "block_" + std::to_string(i), ir::Expr()); + blocks.push_back(ir::ScheduleBlockRealize::Make({}, block)); + } + return blocks; +} + +TEST(BlockSampler, Make) { + std::vector mock_blocks = CreateTestBlocks(); + auto traversal_block_sampler = BlockSampler::Make(mock_blocks, true, "traversal"); + ASSERT_STREQ(traversal_block_sampler->Name(), "traversal"); + auto probabilistic_block_sampler = BlockSampler::Make(mock_blocks, true, "probabilistic"); + ASSERT_STREQ(probabilistic_block_sampler->Name(), "probabilistic"); +} + +TEST(TraversalBlockSampler, NextBlock) { + std::vector blocks = CreateTestBlocks(); + auto traversal_block_sampler = BlockSampler::Make(blocks, true, "traversal"); + ASSERT_EQ("block_0", traversal_block_sampler->NextBlock()); + ASSERT_EQ("block_1", traversal_block_sampler->NextBlock()); + ASSERT_EQ("block_2", traversal_block_sampler->NextBlock()); + ASSERT_EQ("", traversal_block_sampler->NextBlock()); + traversal_block_sampler->Reset(); + ASSERT_EQ("block_0", traversal_block_sampler->NextBlock()); + + traversal_block_sampler = BlockSampler::Make(blocks, false, "traversal"); + ASSERT_EQ("block_0", traversal_block_sampler->NextBlock()); + ASSERT_EQ("block_0", traversal_block_sampler->NextBlock()); +} + +TEST(ProbabilisticBlockSampler, NextBlock) { + std::vector blocks = CreateTestBlocks(); + auto probabilistic_block_sampler = BlockSampler::Make(blocks, false, "probabilistic", 0, {4, 2, 1}); + std::string block_name; + for (int i = 0; i < 20; ++i) { + block_name = probabilistic_block_sampler->NextBlock(); + VLOG(6) << "next block name: " << block_name; + } + + probabilistic_block_sampler = BlockSampler::Make(blocks, true, "probabilistic", 0, {4, 2, 1}); + probabilistic_block_sampler->NextBlock(); + probabilistic_block_sampler->NextBlock(); + 
+  probabilistic_block_sampler->NextBlock();
+  ASSERT_EQ("", probabilistic_block_sampler->NextBlock());
+}
+
+}  // namespace auto_schedule
+}  // namespace cinn
diff --git a/paddle/cinn/auto_schedule/search_space/rule_sampler.cc b/paddle/cinn/auto_schedule/search_space/rule_sampler.cc
new file mode 100644
index 0000000000000..3951af427081f
--- /dev/null
+++ b/paddle/cinn/auto_schedule/search_space/rule_sampler.cc
@@ -0,0 +1,80 @@
+// Copyright (c) 2022 CINN Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "cinn/auto_schedule/search_space/rule_sampler.h"
+
+#include <memory>
+#include <string>
+
+namespace cinn {
+namespace auto_schedule {
+
+std::unique_ptr<RuleSampler> RuleSampler::Make(const std::vector<AutoGenRule*>& potential_rules,
+                                               bool default_remove_policy,
+                                               const std::string& strategy,
+                                               utils::LinearRandomEngine::StateType rand_seed,
+                                               const std::vector<int>& weights) {
+  CHECK_GT(potential_rules.size(), 0) << "Empty rule list";
+  if (strategy == "traversal") {
+    return std::make_unique<TraversalRuleSampler>(potential_rules, default_remove_policy);
+  } else if (strategy == "probabilistic") {
+    return std::make_unique<ProbabilisticRuleSampler>(potential_rules, default_remove_policy, rand_seed, weights);
+  }
+
+  LOG(FATAL) << "Unimplemented strategy:" << strategy;
+  return nullptr;
+}
+
+AutoGenRule* TraversalRuleSampler::NextRule(bool remove) {
+  if (cur_idx_ < potential_rules_->size()) {
+    AutoGenRule* rule = potential_rules_->at(cur_idx_);
+    if (remove) {
+      ++cur_idx_;
+    }
+    return rule;
+  }
+
+  return nullptr;
+}
+
+ProbabilisticRuleSampler::ProbabilisticRuleSampler(const std::vector<AutoGenRule*>& potential_rules,
+                                                   bool default_remove_policy,
+                                                   utils::LinearRandomEngine::StateType rand_seed,
+                                                   const std::vector<int>& weights)
+    : RuleSampler(potential_rules, default_remove_policy),
+      weights_(weights),
+      rand_seed_(utils::LinearRandomEngine::NormalizeState(rand_seed)) {
+  if (weights.empty()) {
+    weights_.resize(potential_rules.size(), 1);
+  } else {
+    CHECK_EQ(potential_rules.size(), weights_.size());
+  }
+  remains_ = potential_rules.size();
+}
+
+AutoGenRule* ProbabilisticRuleSampler::NextRule(bool remove) {
+  if (remains_ == 0) {
+    return nullptr;
+  }
+  int rule_idx = utils::SampleDiscreteFromDistribution(weights_, &rand_seed_);
+  if (remove) {
+    weights_[rule_idx] = 0;
+    --remains_;
+  }
+
+  return potential_rules_->at(rule_idx);
+}
+
+}  // namespace auto_schedule
+}  // namespace cinn
\ No newline at end of file
diff --git a/paddle/cinn/auto_schedule/search_space/rule_sampler.h b/paddle/cinn/auto_schedule/search_space/rule_sampler.h
new file mode 100644
index 0000000000000..828e4a775eeb1
--- /dev/null
+++ b/paddle/cinn/auto_schedule/search_space/rule_sampler.h
@@ -0,0 +1,114 @@
+// Copyright (c) 2022 CINN Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "cinn/auto_schedule/search_space/auto_gen_rule/auto_gen_rule.h"
+#include "cinn/utils/random_engine.h"
+
+namespace cinn {
+namespace auto_schedule {
+
+class SearchState;
+
+// Select the next potential rule for the SearchState during the search process.
+class RuleSampler {
+ public:
+  /**
+   * @brief Create a RuleSampler with the specific strategy name and necessary construct parameters.
+   * @param potential_rules All possible rules to be sampled.
+   * @param default_remove_policy The default option to determine whether to delete the next rule after selecting it.
+   * @param strategy The rule sampling strategy.
+   *        Currently, the available strategies are "traversal" and "probabilistic",
+   *        where "traversal" means to select rules one by one until all rules are traversed,
+   *        and "probabilistic" means randomly picking rules according to the given distribution.
+   * @param rand_seed Random seed used by the probabilistic policy.
+   * @param weights Used for the probabilistic policy, giving each candidate a weight.
+   */
+  static std::unique_ptr<RuleSampler> Make(const std::vector<AutoGenRule*>& potential_rules,
+                                           bool default_remove_policy = true,
+                                           const std::string& strategy = "traversal",
+                                           utils::LinearRandomEngine::StateType rand_seed = 0,
+                                           const std::vector<int>& weights = {});
+
+  // Return the name of the sampling strategy
+  virtual const char* Name() const = 0;
+
+  // Reset associated states to sample from the beginning
+  virtual void Reset() = 0;
+
+  // Select a rule with the default remove policy.
+  AutoGenRule* NextRule() { return NextRule(default_remove_policy_); }
+
+ protected:
+  // A RuleSampler object should be created with the static function Make()
+  RuleSampler(const std::vector<AutoGenRule*>& potential_rules, bool default_remove_policy)
+      : potential_rules_(&potential_rules), default_remove_policy_(default_remove_policy) {}
+
+  // Select a rule to apply.
+  // The param remove determines whether to delete the next rule after selecting it;
+  // if remove == true, it will not be sampled in the future.
+  virtual AutoGenRule* NextRule(bool remove) = 0;
+
+  // The pointer refers to all potential rules
+  const std::vector<AutoGenRule*>* potential_rules_;
+
+  // The default policy to determine whether to delete the next rule after selecting it.
+  bool default_remove_policy_;
+};
+
+// Sample rules with the traversal strategy,
+// which means to select rules one by one until all rules are traversed.
+class TraversalRuleSampler : public RuleSampler {
+ public:
+  TraversalRuleSampler(const std::vector<AutoGenRule*>& potential_rules, bool default_remove_policy)
+      : RuleSampler(potential_rules, default_remove_policy), cur_idx_(0) {}
+
+  const char* Name() const override { return "traversal"; }
+
+  void Reset() override { cur_idx_ = 0; }
+
+ private:
+  AutoGenRule* NextRule(bool remove) override;
+
+ private:
+  int cur_idx_;
+};
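+
+// A hypothetical call site for weighted sampling (illustrative only; `rules`
+// stands for any std::vector<AutoGenRule*>, as built in rule_sampler_test.cc):
+//
+//   auto sampler = RuleSampler::Make(rules, /*default_remove_policy=*/false,
+//                                    "probabilistic", /*rand_seed=*/0, /*weights=*/{4, 1});
+//   AutoGenRule* rule = sampler->NextRule();  // rules[0] is drawn ~4x as often as rules[1]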
+
+// Sample rules with the probabilistic strategy,
+// which means randomly picking rules according to the given distribution.
+class ProbabilisticRuleSampler : public RuleSampler {
+ public:
+  ProbabilisticRuleSampler(const std::vector<AutoGenRule*>& potential_rules,
+                           bool default_remove_policy,
+                           utils::LinearRandomEngine::StateType rand_seed = 0,
+                           const std::vector<int>& weights = {});
+
+  const char* Name() const override { return "probabilistic"; }
+
+  void Reset() override {}
+
+ private:
+  AutoGenRule* NextRule(bool remove) override;
+
+ private:
+  std::vector<int> weights_;
+  utils::LinearRandomEngine::StateType rand_seed_;
+  int remains_;
+};
+
+}  // namespace auto_schedule
+}  // namespace cinn
diff --git a/paddle/cinn/auto_schedule/search_space/rule_sampler_test.cc b/paddle/cinn/auto_schedule/search_space/rule_sampler_test.cc
new file mode 100644
index 0000000000000..91ca4fd5926b0
--- /dev/null
+++ b/paddle/cinn/auto_schedule/search_space/rule_sampler_test.cc
@@ -0,0 +1,75 @@
+// Copyright (c) 2022 CINN Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "cinn/auto_schedule/search_space/rule_sampler.h"
+
+#include <gtest/gtest.h>
+
+#include "cinn/auto_schedule/search_space/auto_gen_rule/auto_unroll.h"
+#include "cinn/auto_schedule/search_space/auto_gen_rule/skip_rule.h"
+
+namespace cinn {
+namespace auto_schedule {
+
+#ifdef CINN_WITH_CUDA
+Target target = common::DefaultNVGPUTarget();
+#else
+Target target = common::DefaultHostTarget();
+#endif
+
+std::vector<AutoGenRule*> GenerateTestRules() { return {new AutoUnroll(target), new SkipRule(target)}; }
+
+TEST(RuleSampler, Make) {
+  std::vector<AutoGenRule*> rules = GenerateTestRules();
+  auto traversal_rule_sampler = RuleSampler::Make(rules, true, "traversal");
+  ASSERT_STREQ(traversal_rule_sampler->Name(), "traversal");
+  auto probabilistic_rule_sampler = RuleSampler::Make(rules, true, "probabilistic");
+  ASSERT_STREQ(probabilistic_rule_sampler->Name(), "probabilistic");
+}
+
+TEST(TraversalRuleSampler, NextRule) {
+  std::vector<AutoGenRule*> rules = GenerateTestRules();
+  auto traversal_rule_sampler = RuleSampler::Make(rules, true, "traversal");
+  AutoGenRule* rule = traversal_rule_sampler->NextRule();
+  ASSERT_EQ("AutoUnroll", rule->GetRuleName());
+  rule = traversal_rule_sampler->NextRule();
+  ASSERT_EQ("SkipRule", rule->GetRuleName());
+  traversal_rule_sampler->Reset();
+  rule = traversal_rule_sampler->NextRule();
+  ASSERT_EQ("AutoUnroll", rule->GetRuleName());
+
+  traversal_rule_sampler = RuleSampler::Make(rules, false, "traversal");
+  rule = traversal_rule_sampler->NextRule();
+  ASSERT_EQ("AutoUnroll", rule->GetRuleName());
+  rule = traversal_rule_sampler->NextRule();
+  ASSERT_EQ("AutoUnroll", rule->GetRuleName());
+}
+
+TEST(ProbabilisticRuleSampler, NextRule) {
+  std::vector<AutoGenRule*> rules = GenerateTestRules();
+  auto probabilistic_rule_sampler = RuleSampler::Make(rules, false, "probabilistic", 0, {4, 1});
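+  // With weights {4, 1}, AutoUnroll is expected to be drawn roughly four times
+  // as often as SkipRule over the 20 samples below (the exact sequence is
+  // deterministic for a fixed seed).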
+  AutoGenRule* rule;
+  for (int i = 0; i < 20; ++i) {
+    rule = probabilistic_rule_sampler->NextRule();
+    VLOG(6) << "next rule name: " << rule->GetRuleName();
+  }
+
+  probabilistic_rule_sampler = RuleSampler::Make(rules, true, "probabilistic", 0, {4, 1});
+  probabilistic_rule_sampler->NextRule();
+  probabilistic_rule_sampler->NextRule();
+  ASSERT_EQ(nullptr, probabilistic_rule_sampler->NextRule());
+}
+
+}  // namespace auto_schedule
+}  // namespace cinn
diff --git a/paddle/cinn/auto_schedule/search_space/search_space.cc b/paddle/cinn/auto_schedule/search_space/search_space.cc
new file mode 100644
index 0000000000000..af10da2215100
--- /dev/null
+++ b/paddle/cinn/auto_schedule/search_space/search_space.cc
@@ -0,0 +1,301 @@
+// Copyright (c) 2022 CINN Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "cinn/auto_schedule/search_space/search_space.h"
+
+#include <glog/logging.h>
+
+#include <algorithm>
+#include <list>
+#include <map>
+
+#include "cinn/auto_schedule/cost_model/expr_cost_model.h"
+#include "cinn/auto_schedule/search_space/auto_gen_rule/auto_gen_rule.h"
+#include "cinn/auto_schedule/search_space/auto_gen_rule/auto_inline.h"
+#include "cinn/auto_schedule/search_space/auto_gen_rule/auto_unroll.h"
+#include "cinn/auto_schedule/search_space/auto_gen_rule/multi_level_tiling.h"
+#include "cinn/auto_schedule/search_space/auto_gen_rule/skip_rule.h"
+#include "cinn/auto_schedule/search_space/block_sampler.h"
+#include "cinn/auto_schedule/search_space/rule_sampler.h"
+#include "cinn/auto_schedule/task/tune_task.h"
+#include "cinn/ir/ir_base.h"
+#include "cinn/ir/ir_schedule.h"
+#include "cinn/optim/ir_copy.h"
+#include "cinn/runtime/flags.h"
+
+DECLARE_bool(auto_schedule_use_cost_model);
+
+namespace cinn {
+namespace auto_schedule {
+
+SearchSpace::SearchSpace(const TuneTask& tune_task, utils::LinearRandomEngine::StateType rand_seed)
+    : tune_task_(tune_task), rand_seed_(utils::LinearRandomEngine::NormalizeState(rand_seed)) {
+  const auto& target = tune_task_.target;
+  // initialize a set of rules that are commonly used by all states
+  // TODO(zhhsplendid): pass correct output names to AutoInline
+  // sketch_rules_.emplace_back(new AutoInline(target, tune_task_.output_names));
+  sketch_rules_.emplace_back(new MultiLevelTiling(target, MultiLevelTiling::kConfigs.at(target.arch)));
+  sketch_rules_.emplace_back(new AutoUnroll(target));
+  sketch_rules_.emplace_back(new SkipRule(target));
+}
+
+SearchState SearchSpace::GetScheduleMutate(const SearchState& state, const ExprCostModel& cost_model) {
+  bool has_manual_schedule = false;
+  if (has_manual_schedule) {
+    SearchState ret = ManualScheduleMutate(state);
+    return ret;
+  }
+  SearchState ret = RandomScheduleMutate(state);
+  if (FLAGS_auto_schedule_use_cost_model) {
+    ret->predicted_cost = cost_model.Predict(ret->ir_schedule.GetModule(), tune_task_.target);
+  }
+  VLOG(4) << JoinStatesDebugString("SearchSpace::GetScheduleMutate", {state}, /*verbose=*/VLOG_IS_ON(5));
+  return ret;
+}
+
+SearchState SearchSpace::ManualScheduleMutate(const SearchState& state) {
+  // TODO(zhhsplendid): Add manual schedule mutate
+  return state;
+}
+
+SearchState SearchSpace::RandomScheduleMutate(const SearchState& state) {
+  // 1. Find the schedules which can be applied to this Expr
+  // 2. Make a distribution over those schedules
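+  //
+  // A worked example with hypothetical numbers: if three rules are applicable
+  // and offer 2, 1 and 3 applicable positions respectively, the map below
+  // becomes {0 -> rule0, 2 -> rule1, 3 -> rule2} with cur_weight == 6;
+  // sampling 4 from [0, 6) then selects rule2 with inner index 4 - 3 = 1.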
+  std::map<int, int> weight_to_rule_index;
+  int cur_weight = 0;
+  SearchState ret(state);
+  std::vector<RuleApplyType> apply_types(ret->applicable_rules.size());
+  for (int idx = 0; idx != ret->applicable_rules.size(); ++idx) {
+    AutoGenRule* rule = ret->applicable_rules.at(idx);
+    RuleApplyType apply_type = rule->Init(&ret->ir_schedule);
+    VLOG(6) << "Evaluate rule:" << rule->GetRuleName() << "=" << static_cast<int>(apply_type);
+    apply_types[idx] = apply_type;
+    if (apply_type != RuleApplyType::kCannotApply) {
+      weight_to_rule_index[cur_weight] = idx;
+      cur_weight += rule->NumberApplicable();
+    }
+  }
+
+  if (weight_to_rule_index.empty()) {
+    // No applicable rule, return the input mod_expr
+    VLOG(6) << "No applicable rule";
+    return ret;
+  }
+
+  // 3. Sample a schedule from the distribution
+  int sample_weighted_index = utils::SampleUniformInt(0, cur_weight, &rand_seed_);
+
+  auto iter = weight_to_rule_index.upper_bound(sample_weighted_index);
+  --iter;
+
+  int sample_rule_index = iter->second;
+  CHECK_LT(sample_rule_index, ret->applicable_rules.size());
+  AutoGenRule* sample_rule = ret->applicable_rules.at(sample_rule_index);
+  VLOG(7) << "Apply rule: " << sample_rule->GetRuleName() << " with index=" << sample_weighted_index - iter->first;
+  // 4. Apply the schedule change
+  sample_rule->Apply(sample_weighted_index - iter->first);
+
+  // 5. Remove the rule after applying it
+  if (apply_types.at(sample_rule_index) != RuleApplyType::kCannotApply) {
+    ret->applicable_rules.erase(ret->applicable_rules.begin() + sample_rule_index);
+  }
+
+  return ret;
+}
+
+std::vector<SearchState> SearchSpace::InitSketchWithRandomStrategy(int num) {
+  VLOG(5) << "SearchSpace::InitSketchWithRandomStrategy with num=" << num;
+  ir::IRSchedule init_schedule(ir::ModuleExpr(tune_task_.GetLoweredFuncBodyExprs()),
+                               utils::ForkRandomState(&rand_seed_));
+  std::vector<AutoGenRule*> init_rules;
+  std::transform(sketch_rules_.begin(), sketch_rules_.end(), std::back_inserter(init_rules), [](const auto& rule) {
+    return rule.get();
+  });
+  std::vector<SearchState> result;
+  while (result.size() < num) {
+    SearchState state(init_schedule, SearchState::NOT_INIT_COST, init_rules);
+    for (int i = 0; i < init_sketch_random_depth_; ++i) {
+      VLOG(6) << "Generating random sketch with RandomScheduleMutate at depth: " << i;
+      state = RandomScheduleMutate(state);
+      if (state->applicable_rules.empty()) {
+        break;
+      }
+    }
+
+    VLOG(5) << JoinStatesDebugString(
+        "SearchSpace::InitSketchWithRandomStrategy-New_Sketch", {state}, /*verbose=*/VLOG_IS_ON(6));
+    result.emplace_back(std::move(state));
+  }
+  return result;
+}
+
+std::vector<SearchState> SearchSpace::InitSketchWithRandomPrunedStrategy() {
+  VLOG(5) << "SearchSpace::InitSketchWithRandomPrunedStrategy";
+  ir::IRSchedule init_schedule(ir::ModuleExpr(tune_task_.GetLoweredFuncBodyExprs()),
+                               utils::ForkRandomState(&rand_seed_));
+  auto all_blocks = init_schedule.GetAllBlocks();
+  auto block_sampler = BlockSampler::Make(all_blocks, true, "probabilistic", utils::ForkRandomState(&rand_seed_));
+
+  std::vector<AutoGenRule*> init_rules;
+  std::transform(sketch_rules_.begin(), sketch_rules_.end() - 1, std::back_inserter(init_rules), [](const auto& rule) {
+    return rule.get();
+  });
+  CHECK(init_rules.size() > 0) << "number of init rules cannot be 0";
+
+  SearchState init_state(init_schedule, SearchState::NOT_INIT_COST, {});
+  std::vector<SearchState> states_buf1{init_state}, states_buf2;
+  std::vector<SearchState>* p_states_cur = &states_buf1;
+  std::vector<SearchState>* p_states_next = &states_buf2;
+  int total_steps = 0, steps;
+  std::string block_name;
+  while ("" != (block_name = block_sampler->NextBlock()) && total_steps < init_sketch_random_depth_) {
+    steps = utils::SampleUniformInt(1, init_rules.size() + 1, &rand_seed_);
+    if (total_steps + steps > init_sketch_random_depth_) {
+      steps = init_sketch_random_depth_ - total_steps;
+    }
+    total_steps += steps;
+    p_states_next->clear();
+    for (const auto& state : *p_states_cur) {
+      auto rule_sampler = RuleSampler::Make(init_rules, true, "probabilistic", utils::ForkRandomState(&rand_seed_));
+      auto new_states = ApplySketchRule(state, block_name, rule_sampler.get(), steps, false, 1);
+      p_states_next->insert(p_states_next->end(), new_states.begin(), new_states.end());
+    }
+    std::swap(p_states_cur, p_states_next);
+  }
+  VLOG(5) << JoinStatesDebugString(
+      "SearchSpace::InitSketchWithRandomPrunedStrategy", *p_states_cur, /*verbose=*/VLOG_IS_ON(6));
+  return *p_states_cur;
+}
+
+std::vector<SearchState> SearchSpace::InitSketchWithRulePrunedStrategy() {
+  VLOG(5) << "SearchSpace::InitSketchWithRulePrunedStrategy";
+  ir::IRSchedule init_schedule(ir::ModuleExpr(tune_task_.GetLoweredFuncBodyExprs()),
+                               utils::ForkRandomState(&rand_seed_));
+  auto all_blocks = init_schedule.GetAllBlocks();
+  std::reverse(all_blocks.begin(), all_blocks.end());
+  auto block_sampler = BlockSampler::Make(all_blocks, true, "traversal");
+
+  std::vector<AutoGenRule*> init_rules;
+  std::transform(sketch_rules_.begin(), sketch_rules_.end() - 1, std::back_inserter(init_rules), [](const auto& rule) {
+    return rule.get();
+  });
+  CHECK(init_rules.size() > 0) << "number of init rules cannot be 0";
+
+  SearchState init_state(init_schedule, SearchState::NOT_INIT_COST, {});
+  std::vector<SearchState> states_buf1{init_state}, states_buf2;
+  std::vector<SearchState>* p_states_cur = &states_buf1;
+  std::vector<SearchState>* p_states_next = &states_buf2;
+  std::string block_name;
+  while ("" != (block_name = block_sampler->NextBlock())) {
+    p_states_next->clear();
+    for (const auto& state : *p_states_cur) {
+      auto rule_sampler = RuleSampler::Make(init_rules, true, "traversal");
+      auto new_states = ApplySketchRule(state, block_name, rule_sampler.get(), 0, true);
+      p_states_next->insert(p_states_next->end(), new_states.begin(), new_states.end());
+    }
+    std::swap(p_states_cur, p_states_next);
+  }
+  VLOG(5) << JoinStatesDebugString(
+      "SearchSpace::InitSketchWithRulePrunedStrategy", *p_states_cur, /*verbose=*/VLOG_IS_ON(6));
+  return *p_states_cur;
+}
+
+std::vector<SearchState> SearchSpace::GenerateSketches(int num, const std::string& strategy) {
+  VLOG(4) << "SearchSpace::GenerateSketches with num = " << num;
+
+  if (strategy == "random") {
+    return InitSketchWithRandomStrategy(num);
+  }
+
+  std::vector<SearchState> result;
+  while (result.size() < num) {
+    std::vector<SearchState> sketches;
+    if (strategy == "rule_prune") {
+      sketches = InitSketchWithRulePrunedStrategy();
+    } else if (strategy == "random_prune") {
+      sketches = InitSketchWithRandomPrunedStrategy();
+    } else {
+      LOG(FATAL) << "Unimplemented init sketch strategy";
+    }
+
+    // The more rules that have been applied to a sketch, the more likely it is
+    // to perform well; such sketches are saved toward the back of the queue,
+    // so we give priority to the results at the rear.
+    for (auto iter = sketches.rbegin(); iter != sketches.rend(); ++iter) {
+      result.push_back(*iter);
+      if (result.size() == num) {
+        break;
+      }
+    }
+  }
+  VLOG(4) << JoinStatesDebugString("SearchSpace::GenerateSketches", result, /*verbose=*/VLOG_IS_ON(5));
+  return result;
+}
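+
+// To illustrate the branching performed below (a hypothetical run): starting
+// from one state S on block "B" with sampled rules r1 then r2, step 1 may
+// yield {S, S+r1} and step 2 {S, S+r2, S+r1, S+r1+r2}; the state count grows
+// per applicable rule unless a branch is pruned by rule or by probability.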
+
+std::vector<SearchState> SearchSpace::ApplySketchRule(const SearchState& state,
+                                                      const std::string& block_name,
+                                                      RuleSampler* rule_sampler,
+                                                      int steps,
+                                                      bool prune_by_rule,
+                                                      double prune_probability) {
+  std::list<SearchState> layer{state};
+  int step = 0;
+  AutoGenRule* rule;
+  // After determining a SearchState and a block, each rule has two possibilities: apply and not apply.
+  // Across all transfer spaces, select a rule at each step, and collect all possible new states reached
+  // by applying and by not applying it. This forms a tree, and we can use rule pruning or random pruning
+  // to reduce the number of sketches.
+  VLOG(6) << "Collect the states of all transfers within steps: " << steps;
+  while ((step++ < steps || steps == 0) && (rule = rule_sampler->NextRule())) {
+    VLOG(7) << "step = " << step << ", rule: " << rule->GetRuleName();
+    std::list<SearchState> new_states;
+    int id = 0;
+    for (std::list<SearchState>::iterator iter = layer.begin(); iter != layer.end();) {
+      // Some rules will reduce the number of blocks, such as AutoInline,
+      // so we need to check whether the SearchState still has the block.
+      if (!(*iter)->ir_schedule.HasBlock(block_name)) {
+        ++iter;
+        continue;
+      }
+      auto type = rule->AnalyseApplyType(*iter, block_name);
+      VLOG(7) << "At SearchState " << ++id
+              << ", apply type = " << static_cast<std::underlying_type<RuleApplyType>::type>(type);
+      // if the rule cannot be applied, skip it
+      if (type == RuleApplyType::kCannotApply) {
+        ++iter;
+        continue;
+      }
+      // if the rule can be applied, apply it and determine whether to prune the branch that does not apply it
+      std::vector<SearchState> tmp_states = rule->ApplyOnBlock(*iter, block_name);
+      new_states.insert(new_states.end(), tmp_states.begin(), tmp_states.end());
+      bool need_prune = false;
+      if (prune_by_rule) {
+        need_prune = (type == RuleApplyType::kApplyAndPruneOtherRules);
+      } else {
+        need_prune = (utils::SampleUniformDouble(0, 1, &rand_seed_) < prune_probability);
+      }
+      if (need_prune) {
+        iter = layer.erase(iter);
+      } else {
+        ++iter;
+      }
+    }
+    VLOG(7) << "apply on block: " << block_name << ", generate " << new_states.size() << " new states at step " << step;
+    layer.splice(layer.end(), std::move(new_states));
+  }
+  VLOG(6) << "apply on block: " << block_name << ", generate " << layer.size() - 1 << " more states in total";
+  return std::vector<SearchState>(layer.begin(), layer.end());
+}
+
+}  // namespace auto_schedule
+}  // namespace cinn
diff --git a/paddle/cinn/auto_schedule/search_space/search_space.h b/paddle/cinn/auto_schedule/search_space/search_space.h
new file mode 100644
index 0000000000000..afa87174ca2c9
--- /dev/null
+++ b/paddle/cinn/auto_schedule/search_space/search_space.h
@@ -0,0 +1,104 @@
+// Copyright (c) 2022 CINN Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "cinn/auto_schedule/cost_model/expr_cost_model.h"
+#include "cinn/auto_schedule/search_space/auto_gen_rule/auto_gen_rule.h"
+#include "cinn/auto_schedule/search_space/rule_sampler.h"
+#include "cinn/auto_schedule/search_space/search_state.h"
+#include "cinn/auto_schedule/task/tune_task.h"
+#include "cinn/ir/ir_base.h"
+#include "cinn/ir/ir_schedule.h"
+
+namespace cinn {
+namespace auto_schedule {
+
+/**
+ * This class is an abstraction of the transformations that can be applied to
+ * ir::Expr during auto-tuning. The transformation can be:
+ *
+ * 1. a manually defined schedule
+ * 2. a schedule generated by AutoGenRule
+ *
+ * TODO(zhhsplendid): de-duplicate the generated ModuleExprs
+ */
+class SearchSpace {
+ public:
+  SearchSpace(const TuneTask& tune_task, utils::LinearRandomEngine::StateType rand_seed = -1);
+
+  // Sketch mutate, returns the mutated ModuleExpr and its estimated cost
+  virtual SearchState GetScheduleMutate(const SearchState& state, const ExprCostModel& cost_model);
+
+  /**
+   * \brief Generate sketches as the initial population of evolutionary search.
+   * @param num The number of sketches to generate.
+   * @param strategy The strategy to generate sketches.
+   *        Current optional strategies are "rule_prune", "random_prune" and "random".
+   *        - "rule_prune": uses rules to prune and generates sketches as efficiently as possible.
+   *        - "random_prune": uses the new interface ApplySketchRule() to simulate the random generation of sketches,
+   *          and supports a rule returning multiple SearchStates as well as random pruning by probability.
+   *        - "random": randomly selects a block and a rule to apply and repeats this step several times;
+   *          however, each rule can be used on one SearchState at most once.
+   * @return Generated sketches.
+   */
+  virtual std::vector<SearchState> GenerateSketches(int num, const std::string& strategy);
+
+ private:
+  // TODO(zhhsplendid): mutate by manual schedule.
+  SearchState ManualScheduleMutate(const SearchState& state);
+
+  // mutate by sketch rules randomly
+  SearchState RandomScheduleMutate(const SearchState& state);
+
+  // Generate num sketches, each with several rounds of SketchMutate
+  std::vector<SearchState> InitSketchWithRandomStrategy(int num);
+
+  // Generate sketches pruned randomly as the initial population of evolutionary search
+  std::vector<SearchState> InitSketchWithRandomPrunedStrategy();
+
+  // Generate sketches pruned by rules as the initial population of evolutionary search
+  std::vector<SearchState> InitSketchWithRulePrunedStrategy();
+
+  /**
+   * @brief Collect the new states that may be transferred to after applying several rules on a block from a certain
+   * state.
+   * @param state Starting point of the state transition.
+   * @param block_name Name of the block to apply the rules to.
+   * @param rule_sampler Sampler that samples the next rule to apply on the block.
+   * @param steps Number of steps to apply rules.
+   * @param prune_by_rule If true, prune the state transition tree by rule, otherwise prune randomly.
+   * @param prune_probability Pruning probability of random pruning.
+   */
+  std::vector<SearchState> ApplySketchRule(const SearchState& state,
+                                           const std::string& block_name,
+                                           RuleSampler* rule_sampler,
+                                           int steps,
+                                           bool prune_by_rule,
+                                           double prune_probability = 1);
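+
+  // For illustration, one rule-pruned expansion step over a block named "B"
+  // (hypothetical arguments, cf. InitSketchWithRulePrunedStrategy):
+  //   auto states = ApplySketchRule(init_state, "B", rule_sampler.get(),
+  //                                 /*steps=*/0, /*prune_by_rule=*/true);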
+
+ private:
+  const TuneTask& tune_task_;
+  int init_sketch_random_depth_ = 6;
+  // supported AutoGenRules, every task holds a set
+  std::vector<std::unique_ptr<AutoGenRule>> sketch_rules_;
+  utils::LinearRandomEngine::StateType rand_seed_;
+};
+
+}  // namespace auto_schedule
+}  // namespace cinn
diff --git a/paddle/cinn/auto_schedule/search_space/search_space_test.cc b/paddle/cinn/auto_schedule/search_space/search_space_test.cc
new file mode 100644
index 0000000000000..2e1064ba7f929
--- /dev/null
+++ b/paddle/cinn/auto_schedule/search_space/search_space_test.cc
@@ -0,0 +1,21 @@
+// Copyright (c) 2022 CINN Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "cinn/auto_schedule/search_space/search_space.h"
+
+#include <gtest/gtest.h>
+
+namespace cinn {
+namespace auto_schedule {}  // namespace auto_schedule
+}  // namespace cinn
diff --git a/paddle/cinn/auto_schedule/search_space/search_state.cc b/paddle/cinn/auto_schedule/search_space/search_state.cc
new file mode 100644
index 0000000000000..48f9e8532085f
--- /dev/null
+++ b/paddle/cinn/auto_schedule/search_space/search_state.cc
@@ -0,0 +1,152 @@
+// Copyright (c) 2022 CINN Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "cinn/auto_schedule/search_space/search_state.h"
+
+#include <memory>
+#include <sstream>
+#include <utility>
+#include <vector>
+
+#include "cinn/ir/ir_base.h"
+#include "cinn/ir/ir_printer.h"
+#include "cinn/ir/ir_schedule.h"
+#include "cinn/ir/ir_visitor.h"
+#include "cinn/utils/functional.h"
+#include "cinn/utils/string.h"
+
+namespace cinn {
+namespace auto_schedule {
+
+SearchState::SearchState(ir::IRSchedule ir_sch, float cost, const std::vector<AutoGenRule*>& rules)
+    : common::Shared<_SearchState_>(common::make_shared<_SearchState_>()) {
+  auto* state = get();
+  state->ir_schedule = std::move(ir_sch);
+  state->applicable_rules = rules;
+  state->predicted_cost = cost;
+}
+
+SearchState SearchState::Copy() const { return SearchState((*this)->ir_schedule, (*this)->predicted_cost, {}); }
+
+std::string _SearchState_::DebugString() const {
+  const auto& exprs = ir_schedule.GetModule().GetExprs();
+  std::stringstream module_stream;
+  for (auto i = 0; i < exprs.size(); ++i) {
+    module_stream << "Expr " << i << " {\n" << exprs.at(i) << "\n} // end Expr";
+  }
+
+  const char* fmt_str = R"ROC(
+ModuleExpr {
+%s
+} // end ModuleExpr
+ScheduleDesc {
+%s
+} // end ScheduleDesc
+predicted_cost: %f)ROC";
+
+  return utils::StringFormat(
+      fmt_str, module_stream.str().c_str(), ir_schedule.GetTraceDesc().DebugString().c_str(), predicted_cost);
+}
+
+bool operator<(const SearchState& left, const SearchState& right) {
+  return left->predicted_cost < right->predicted_cost;
+}
+
+// Visit every node by expanding all of their fields in dfs order
+class DfsWithExprsFields : public ir::IRVisitor {
+ protected:
+#define __m(t__)                          \
+  void Visit(const ir::t__* x) override { \
+    for (auto* n : x->expr_fields()) {    \
+      if (n->defined()) {                 \
+        Visit(n);                         \
+      }                                   \
+    }                                     \
+  }
+
+  NODETY_FORALL(__m)
+#undef __m
+
+  void Visit(const Expr* expr) override { IRVisitor::Visit(expr); }
+};
+
+// Generate a reduced hash of an AST tree by combining the hashes of all AST nodes
+class IrNodesStructuralHash : public DfsWithExprsFields {
+ public:
+  IrNodesStructuralHash(size_t init_key) : hash_key_(init_key) {}
+  size_t operator()(const Expr* expr) {
+    Visit(expr);
+    return hash_key_;
+  }
+
+  void Visit(const Expr* expr) override {
+    static decltype(ir::kIrNodeTyReprs) Node2Name = ir::kIrNodeTyReprs;
+    if (!expr->defined()) return;
+    auto type_code = static_cast<IrNodeTyUnderlyingType>(expr->node_type());
+    hash_key_ = utils::HashCombine(hash_key_, type_code);
+    DfsWithExprsFields::Visit(expr);
+  }
+
+ private:
+  void Visit(const ir::_Tensor_* x) override {
+    for (auto& e : x->shape) {
+      Visit(&e);
+    }
+    DfsWithExprsFields::Visit(x->buffer.As<ir::_Buffer_>());
+  }
+
+  using IrNodeTyUnderlyingType = std::underlying_type<ir::IrNodeTy>::type;
+  size_t hash_key_;
+};
+
+size_t SearchStateHash::operator()(const SearchState& s) const {
+  size_t hash_key = 0;
+  const auto& exprs = s->ir_schedule.GetModule().GetExprs();
+  for (auto&& expr : exprs) {
+    hash_key = IrNodesStructuralHash(hash_key)(&expr);
+  }
+  return hash_key;
+}
+
+bool SearchStateEqual::operator()(const SearchState& lhs, const SearchState& rhs) const {
+  const auto& lhs_exprs = lhs->ir_schedule.GetModule().GetExprs();
+  const auto& rhs_exprs = rhs->ir_schedule.GetModule().GetExprs();
+  // compare the sizes of the exprs first
+  if (lhs_exprs.size() != rhs_exprs.size()) return false;
+
+  // compare every expr one by one with ir::IrEqualVisitor
+  for (int i = 0; i < lhs_exprs.size(); ++i) {
+    ir::IrEqualVisitor comparator(/*allow_name_suffix_diff=*/true);  // ignore suffix differences in names
+    if (!comparator.Compare(lhs_exprs[i], rhs_exprs[i])) return false;
+  }
+  return true;
+}
+
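+// The two functors above are meant to be used together, e.g. (as in
+// EvolutionarySearch's visited_candidates_ member):
+//   std::unordered_set<SearchState, SearchStateHash, SearchStateEqual> seen;
+//   seen.insert(state);  // structural de-duplication of schedules
+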
+std::string JoinStatesDebugString(const std::string& title, const std::vector<SearchState>& states, bool verbose) {
+  std::stringstream ss;
+  ss << title << " states size:" << states.size() << "\n";
+  SearchStateHash state_hasher;
+  for (size_t i = 0; i < states.size(); ++i) {
+    uint64_t hash_key = state_hasher(states[i]);
+    if (verbose) {
+      ss << "\tState-" << i << " hash:" << hash_key << "\t content:------>" << states[i]->DebugString() << "\n<------";
+    } else {
+      ss << "\tState-" << i << " hash:" << hash_key << "\n";
+    }
+  }
+  return std::move(*ss.rdbuf()).str();
+}
+
+}  // namespace auto_schedule
+}  // namespace cinn
diff --git a/paddle/cinn/auto_schedule/search_space/search_state.h b/paddle/cinn/auto_schedule/search_space/search_state.h
new file mode 100644
index 0000000000000..db2bfa3f7e276
--- /dev/null
+++ b/paddle/cinn/auto_schedule/search_space/search_state.h
@@ -0,0 +1,87 @@
+// Copyright (c) 2022 CINN Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <limits>
+#include <string>
+#include <vector>
+
+#include "cinn/common/object.h"
+#include "cinn/common/shared.h"
+#include "cinn/ir/ir_compare.h"
+#include "cinn/ir/ir_schedule.h"
+#include "cinn/ir/ir_visitor.h"
+
+namespace cinn {
+namespace auto_schedule {
+
+struct _SearchState_;
+class AutoGenRule;
+
+//! Shared wrapper for _SearchState_
+class SearchState : public common::Shared<_SearchState_> {
+ public:
+  SearchState() = default;
+  // create a new SearchState
+  explicit SearchState(ir::IRSchedule ir_sch,
+                       float cost = NOT_INIT_COST,
+                       const std::vector<AutoGenRule*>& rules = {});
+
+  // Constant standing for a cost that has not been initialized
+  static constexpr float NOT_INIT_COST = std::numeric_limits<float>::max();
+  // compare function for two states
+  friend bool operator<(const SearchState& left, const SearchState& right);
+
+  // Deep copy a SearchState
+  SearchState Copy() const;
+};
+
+//! Class to store intermediate states during the search
+struct _SearchState_ : public common::Object {
+  // IRSchedule contains the ir::ModuleExpr and the trace of the scheduling process
+  ir::IRSchedule ir_schedule;
+  // Cost predicted by the cost model
+  float predicted_cost;
+  // The rules that can be applied to the IRSchedule at this state.
+  std::vector<AutoGenRule*> applicable_rules;
+
+  // Return a detailed string of the content for debugging
+  std::string DebugString() const;
+
+  const char* type_info() const override { return __type_info__; }
+  static constexpr const char* __type_info__ = "auto_schedule_state";
+};
+
+// SearchStateHash hash functor that visits every AST node and combines the hashes of their node_type in dfs order
+struct SearchStateHash {
+  size_t operator()(const SearchState& s) const;
+};
+
+// SearchState equality functor, using ir::IrEqualVisitor to compare the AST structure and fields
+struct SearchStateEqual {
+  bool operator()(const SearchState& lhs, const SearchState& rhs) const;
+};
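+
+// Note: since only node types feed the hash and IrEqualVisitor runs with
+// allow_name_suffix_diff=true, two module trees that differ only in name
+// suffixes (e.g. blocks named "i0" vs "i0_0") hash equal and compare equal.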
+
+/*!
+ * \brief Concatenate the debug strings of all states with additional info.
+ * \param title Head of the result string.
+ * \param states SearchState array to be debugged.
+ * \param verbose Whether to enable more verbose debug info.
+ * \return The concatenated debug string.
+ */
+std::string JoinStatesDebugString(const std::string& title,
+                                  const std::vector<SearchState>& states,
+                                  bool verbose = false);
+
+}  // namespace auto_schedule
+}  // namespace cinn
diff --git a/paddle/cinn/auto_schedule/search_space/search_state_test.cc b/paddle/cinn/auto_schedule/search_space/search_state_test.cc
new file mode 100644
index 0000000000000..598fc95317589
--- /dev/null
+++ b/paddle/cinn/auto_schedule/search_space/search_state_test.cc
@@ -0,0 +1,136 @@
+// Copyright (c) 2022 CINN Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "cinn/auto_schedule/search_space/search_state.h"
+
+#include <gtest/gtest.h>
+
+#include <string>
+
+#include "cinn/cinn.h"
+#include "cinn/common/context.h"
+
+namespace cinn {
+namespace auto_schedule {
+
+TEST(TestSearchState, SearchStateHash_Equal) {
+  Target target = common::DefaultHostTarget();
+
+  ir::Expr M(32);
+  ir::Expr N(32);
+
+  lang::Placeholder<float> A("A", {M, N});
+  ir::Tensor B = lang::Compute(
+      {M, N}, [&](Var i, Var j) { return A(i, j) + ir::Expr(2.f); }, "B");
+  ir::Tensor C = lang::Compute(
+      {M, N}, [&](Var i, Var j) { return A(i, j) + B(i, j); }, "C");
+
+  cinn::common::Context::Global().ResetNameId();
+  auto a_plus_const_funcs_1 =
+      lang::LowerVec("A_plus_const", poly::CreateStages({A, B}), {A, B}, {}, {}, nullptr, target, true);
+
+  cinn::common::Context::Global().ResetNameId();
+  auto a_plus_const_funcs_2 =
+      lang::LowerVec("A_plus_const", poly::CreateStages({A, B}), {A, B}, {}, {}, nullptr, target, true);
+
+  cinn::common::Context::Global().ResetNameId();
+  auto a_plus_b_funcs = lang::LowerVec("A_plus_B", poly::CreateStages({A, C}), {A, C}, {}, {}, nullptr, target, true);
+
+  std::string a_plus_const_funcs_1_str = R"ROC(function A_plus_const (_A, _B)
+{
+  ScheduleBlock(root)
+  {
+    serial for (i, 0, 32)
+    {
+      serial for (j, 0, 32)
+      {
+        ScheduleBlock(B)
+        {
+          i0, i1 = axis.bind(i, j)
+          B[i0, i1] = (A[i0, i1] + 2.00000000f)
+        }
+      }
+    }
+  }
+})ROC";
+
+  std::string a_plus_const_funcs_2_str = R"ROC(function A_plus_const (_A, _B)
+{
+  ScheduleBlock(root)
+  {
+    serial for (i, 0, 32)
+    {
+      serial for (j, 0, 32)
+      {
+        ScheduleBlock(B)
+        {
+          i0, i1 = axis.bind(i, j)
+          B[i0, i1] = (A[i0, i1] + 2.00000000f)
+        }
+      }
+    }
+  }
+})ROC";
+
+  std::string a_plus_b_funcs_str = R"ROC(function A_plus_B (_A, _C)
+{
+  ScheduleBlock(root)
+  {
+    {
+      serial for (i, 0, 32)
+      {
+        serial for (j, 0, 32)
+        {
+          ScheduleBlock(B)
+          {
+            i0, i1 = axis.bind(i, j)
+            B[i0, i1] = (A[i0, i1] + 2.00000000f)
+          }
+        }
+      }
+      serial for (i, 0, 32)
+      {
+        serial for (j, 0, 32)
+        {
+          ScheduleBlock(C)
+          {
+            i0_0, i1_0 = axis.bind(i, j)
+            C[i0_0, i1_0] = (A[i0_0, i1_0] + B[i0_0, i1_0])
+          }
+        }
+      }
+    }
+  }
+})ROC";
+
+  ASSERT_EQ(a_plus_const_funcs_1.size(), 1);
+  EXPECT_EQ(a_plus_const_funcs_1_str,
utils::GetStreamCnt(a_plus_const_funcs_1.front())); + ASSERT_EQ(a_plus_const_funcs_2.size(), 1); + EXPECT_EQ(a_plus_const_funcs_2_str, utils::GetStreamCnt(a_plus_const_funcs_2.front())); + ASSERT_EQ(a_plus_b_funcs.size(), 1); + EXPECT_EQ(a_plus_b_funcs_str, utils::GetStreamCnt(a_plus_b_funcs.front())); + + SearchState a_plus_const_state1(ir::IRSchedule(ir::ModuleExpr({a_plus_const_funcs_1.front()->body}))); + SearchState a_plus_const_state2(ir::IRSchedule(ir::ModuleExpr({a_plus_const_funcs_2.front()->body}))); + SearchState a_plus_b_state(ir::IRSchedule(ir::ModuleExpr({a_plus_b_funcs.front()->body}))); + + SearchStateHash hash_functor; + SearchStateEqual equal_functor; + ASSERT_EQ(hash_functor(a_plus_const_state1), hash_functor(a_plus_const_state2)); + ASSERT_TRUE(equal_functor(a_plus_const_state1, a_plus_const_state2)); + ASSERT_NE(hash_functor(a_plus_const_state1), hash_functor(a_plus_b_state)); + ASSERT_FALSE(equal_functor(a_plus_const_state1, a_plus_b_state)); +} + +} // namespace auto_schedule +} // namespace cinn diff --git a/paddle/cinn/auto_schedule/search_strategy/CMakeLists.txt b/paddle/cinn/auto_schedule/search_strategy/CMakeLists.txt new file mode 100644 index 0000000000000..a31e01c801a57 --- /dev/null +++ b/paddle/cinn/auto_schedule/search_strategy/CMakeLists.txt @@ -0,0 +1,7 @@ +add_subdirectory(mutate_rule) + +core_gather_headers() + +gather_srcs(cinnapi_src SRCS evolutionary_search.cc) + +cc_test(test_evolutionary_search SRCS evolutionary_search_test.cc DEPS cinncore test_program_builder) diff --git a/paddle/cinn/auto_schedule/search_strategy/evolutionary_search.cc b/paddle/cinn/auto_schedule/search_strategy/evolutionary_search.cc new file mode 100644 index 0000000000000..c938718ad06af --- /dev/null +++ b/paddle/cinn/auto_schedule/search_strategy/evolutionary_search.cc @@ -0,0 +1,302 @@ +// Copyright (c) 2022 CINN Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+
+#include "cinn/auto_schedule/search_strategy/evolutionary_search.h"
+
+#include <glog/logging.h>
+
+#include <algorithm>
+#include <limits>
+#include <memory>
+#include <tuple>
+#include <utility>
+#include <vector>
+
+#include "cinn/auto_schedule/database/database.h"
+#include "cinn/auto_schedule/post_schedule_rule/cooperative_process.h"
+#include "cinn/auto_schedule/search_space/search_space.h"
+#include "cinn/auto_schedule/search_space/search_state.h"
+#include "cinn/auto_schedule/search_strategy/mutate_rule/mutate_tile_size.h"
+#include "cinn/auto_schedule/task/task_registry.h"
+#include "cinn/auto_schedule/task/tune_task.h"
+#include "cinn/auto_schedule/tuning.h"
+#include "cinn/optim/ir_copy.h"
+#include "cinn/utils/multi_threading.h"
+#include "cinn/utils/sized_multi_set.h"
+#include "cinn/utils/string.h"
+
+DECLARE_bool(auto_schedule_use_cost_model);
+
+namespace cinn {
+namespace auto_schedule {
+
+EvolutionarySearch::EvolutionarySearch(const TuneTask& tune_task,
+                                       const ExprCostModel& cost_model,
+                                       Database* database,
+                                       utils::LinearRandomEngine::StateType rand_seed,
+                                       const std::vector<std::tuple<std::string, double>>& mutate_rules)
+    : tune_task_(tune_task),
+      cost_model_(cost_model),
+      database_(database),
+      rand_seed_(utils::LinearRandomEngine::NormalizeState(rand_seed)),
+      mutators_(mutate_rules) {
+  search_space_ = std::make_unique<SearchSpace>(tune_task, utils::ForkRandomState(&rand_seed_));
+  if (mutators_.empty()) {
+    mutators_.push_back(std::make_tuple("mutate_tile_size", 1.0));
+  }
+  double accum_weight = 0.0;
+  for (const auto& mutator : mutators_) {
+    if (std::get<1>(mutator) > 0) {
+      accum_weight += std::get<1>(mutator);
+      weighted_mutators_.insert(std::make_pair(accum_weight, MutateRule::Make(std::get<0>(mutator))));
+    }
+  }
+
+  post_schedule_rules_.emplace_back(new CooperativeProcess);
+}
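+
+// Illustrative example of the accumulated-weight map built above: with
+// mutators_ = {("mutate_tile_size", 1.0)} it holds {1.0 -> MutateTileSize};
+// Mutate() then samples u ~ U[0, accum_weight) and picks upper_bound(u).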
+
+EvolutionarySearch::~EvolutionarySearch() {}
+
+SearchState EvolutionarySearch::SearchModuleExpr(const TuningOptions& options) {
+  return SearchModuleExprBests(options)[0];
+}
+
+std::vector<SearchState> EvolutionarySearch::SearchModuleExprBests(const TuningOptions& options) {
+  VLOG(4) << "start SearchModuleExprBests with initial statistics: visited_candidates size="
+          << visited_candidates_.size();
+  std::vector<SearchState> init_population;
+  std::vector<SearchState> topk_from_database = GetTopKCandidatesFromDatabase(options.evolution_pick_database_topk);
+  VLOG(4) << JoinStatesDebugString(
+      "EvolutionarySearch::GetTopKCandidatesFromDatabase", topk_from_database, /*verbose=*/VLOG_IS_ON(5));
+  int init_num = options.evolution_init_population_num - topk_from_database.size();
+
+  std::vector<SearchState> init_sketch = InitSketch(init_num, "rule_prune");
+  VLOG(4) << JoinStatesDebugString("EvolutionarySearch::InitSketch", init_sketch, /*verbose=*/VLOG_IS_ON(5));
+
+  init_population.insert(init_population.end(), topk_from_database.begin(), topk_from_database.end());
+  init_population.insert(init_population.end(), init_sketch.begin(), init_sketch.end());
+
+  std::vector<SearchState> picked_bests =
+      Evolve(init_population, options.evolution_cross_over_num, options.num_samples_per_iteration);
+  VLOG(4) << JoinStatesDebugString("EvolutionarySearch::Evolve", picked_bests, /*verbose=*/VLOG_IS_ON(5));
+  return picked_bests;
+}
+
+std::vector<SearchState> EvolutionarySearch::SearchModuleExprEpsGreedy(const TuningOptions& options) {
+  std::vector<SearchState> picked_bests = SearchModuleExprBests(options);
+  int random_num = options.evolution_init_population_num - options.evolution_pick_database_topk;
+  auto results = PickNextGenerationEpsGreedy(picked_bests,
+                                             InitSketch(random_num, "random_prune"),
+                                             options.num_samples_per_iteration,
+                                             options.evolution_eps_greedy);
+  VLOG(4) << JoinStatesDebugString(
+      "EvolutionarySearch::PickNextGenerationEpsGreedy", results, /*verbose=*/VLOG_IS_ON(5));
+  return results;
+}
+
+std::vector<SearchState> EvolutionarySearch::GetTopKCandidatesFromDatabase(int topk) {
+  std::vector<SearchState> results;
+  const auto& task_key = tune_task_.serialized_key;
+  auto records = database_->GetTopK(task_key, topk);
+  InitialTaskRegistry* task_registry = InitialTaskRegistry::Global();
+  for (auto&& record : records) {
+    ir::IRSchedule ir_sch(optim::IRCopy(task_registry->Get(task_key)->module_expr),
+                          utils::ForkRandomState(&rand_seed_));
+    ir::ScheduleDesc::ReplayWithProto(record.trace, &ir_sch);
+    results.emplace_back(SearchState(std::move(ir_sch), record.predicted_cost));
+  }
+  return results;
+}
+
+void ApplyPostScheduleRules(ir::IRSchedule* schedule,
+                            const std::vector<std::unique_ptr<PostScheduleRule>>& post_schedule_rules) {
+  schedule->TagPostSchedule();
+  for (const auto& post_rule : post_schedule_rules) {
+    post_rule->Apply(schedule);
+  }
+}
+
+std::vector<SearchState> EvolutionarySearch::InitSketch(int num, const std::string& strategy) {
+  VLOG(4) << "InitSketch with num:" << num << ", strategy: " << strategy;
+  std::vector<SearchState> states = search_space_->GenerateSketches(num, strategy);
+  auto post_schedule_fn = [this, &states](int index) {
+    ApplyPostScheduleRules(&states[index]->ir_schedule, post_schedule_rules_);
+  };
+  utils::parallel_run(post_schedule_fn, utils::SequenceDispatcher(0, states.size()), states.size());
+
+  return states;
+}
+
+SearchState EvolutionarySearch::CrossOver(const SearchState& state1, const SearchState& state2) {
+  // TODO(CtfGo): tracing CrossOver with IRSchedule
+  std::vector<ir::Expr> cross_over_exprs;
+  std::vector<ir::Expr> father_exprs = state1->ir_schedule.GetModule().GetExprs();
+  std::vector<ir::Expr> mother_exprs = state2->ir_schedule.GetModule().GetExprs();
+
+  CHECK_EQ(father_exprs.size(), mother_exprs.size())
+      << "CrossOver ModuleExprs in EvolutionarySearch must have the same number of ASTs";
+
+  for (size_t i = 0; i < father_exprs.size(); ++i) {
+    if (utils::SampleUniformInt(0, 2, &rand_seed_) == 0) {
+      cross_over_exprs.push_back(optim::IRCopy(father_exprs[i]));
+    } else {
+      cross_over_exprs.push_back(optim::IRCopy(mother_exprs[i]));
+    }
+  }
+  auto res = SearchState(ir::IRSchedule(ir::ModuleExpr(cross_over_exprs), utils::ForkRandomState(&rand_seed_)));
+  if (FLAGS_auto_schedule_use_cost_model) {
+    res->predicted_cost = cost_model_.Predict(res->ir_schedule.GetModule(), tune_task_.target);
+  }
+  VLOG(5) << JoinStatesDebugString("EvolutionarySearch::CrossOver", {state1, state2, res}, /*verbose=*/VLOG_IS_ON(6));
+  return res;
+}
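+
+// Example of the per-AST coin flips above (hypothetical draws): with three
+// ASTs and draws {0, 1, 0}, the child deep-copies ASTs 0 and 2 from state1
+// and AST 1 from state2.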
+
+SearchState EvolutionarySearch::Mutate(const SearchState& state, utils::LinearRandomEngine::StateType* rand_seed) {
+  CHECK_GT(weighted_mutators_.size(), 0) << "There is no mutate rule that can be applied.";
+  double accu_weight = (weighted_mutators_.rbegin())->first;
+  CHECK_GT(accu_weight, 0) << "The accumulated weight must be greater than 0.";
+  // sample a mutate rule
+  double sample_weight = utils::SampleUniformDouble(0, accu_weight, rand_seed);
+  auto sampled_iter = weighted_mutators_.upper_bound(sample_weight);
+  MutateRule* mutator = sampled_iter->second.get();
+  CHECK(mutator) << "mutator not defined";
+  // apply the mutation on the trace of the SearchState
+  auto trace = state->ir_schedule.GetTraceDesc();
+  auto new_trace = mutator->Apply(trace, rand_seed);
+  // replay the mutated trace on the original ModuleExpr to generate a new ir_schedule
+  const auto& task_key = tune_task_.serialized_key;
+  InitialTaskRegistry* task_registry = InitialTaskRegistry::Global();
+  ir::IRSchedule new_ir_sch(optim::IRCopy(task_registry->Get(task_key)->module_expr),
+                            utils::ForkRandomState(rand_seed));
+  new_trace.Replay(&new_ir_sch, true);
+  ApplyPostScheduleRules(&new_ir_sch, post_schedule_rules_);
+  auto res = SearchState(std::move(new_ir_sch));
+
+  VLOG(5) << JoinStatesDebugString("EvolutionarySearch::Mutate", {state, res}, /*verbose=*/VLOG_IS_ON(6));
+  return res;
+}
+
+std::vector<SearchState> EvolutionarySearch::Evolve(const std::vector<SearchState>& population,
+                                                    int cross_over_num,
+                                                    int ret_num) {
+  VLOG(4) << utils::StringFormat(
+      "Evolve with population size=%lu,cross_over_num:%d,ret_num:%d", population.size(), cross_over_num, ret_num);
+  int generation_num = population.size();
+  if (generation_num == 0) {
+    return std::vector<SearchState>();
+  }
+  // init evolution
+  std::vector<SearchState> evolution(population);
+  for (SearchState& search_state : evolution) {
+    if (search_state->predicted_cost == SearchState::NOT_INIT_COST && FLAGS_auto_schedule_use_cost_model) {
+      search_state->predicted_cost = cost_model_.Predict(search_state->ir_schedule.GetModule(), tune_task_.target);
+    }
+  }
+  VLOG(4) << JoinStatesDebugString("EvolutionarySearch::Evolve: Init evolution:", evolution, /*verbose=*/VLOG_IS_ON(5));
+  // cross over
+  for (int i = 0; i < cross_over_num; ++i) {
+    int first_rand_idx = utils::SampleUniformInt(0, generation_num, &rand_seed_);
+    int second_rand_idx = utils::SampleUniformInt(0, generation_num, &rand_seed_);
+    while (first_rand_idx == second_rand_idx) {
+      second_rand_idx = utils::SampleUniformInt(0, generation_num, &rand_seed_);
+    }
+    evolution.push_back(CrossOver(population[first_rand_idx], population[second_rand_idx]));
+  }
+  VLOG(4) << JoinStatesDebugString(
+      "EvolutionarySearch::Evolve: after CrossOver evolution:", evolution, /*verbose=*/VLOG_IS_ON(5));
+  // mutate
+  std::vector<SearchState> mutated_individuals(evolution.size());
+  std::vector<utils::LinearRandomEngine::StateType> rand_seeds(evolution.size());
+  for (int i = 0; i < rand_seeds.size(); ++i) {
+    rand_seeds[i] = utils::ForkRandomState(&rand_seed_);
+  }
+  auto mutate_fn = [this, &evolution, &mutated_individuals, &rand_seeds](int index) {
+    mutated_individuals[index] = Mutate(evolution[index], &rand_seeds[index]);
+  };
+  utils::parallel_run(mutate_fn, utils::SequenceDispatcher(0, evolution.size()), evolution.size());
+  if (FLAGS_auto_schedule_use_cost_model) {
+    for (size_t i = 0; i < mutated_individuals.size(); ++i) {
+      mutated_individuals[i]->predicted_cost =
+          cost_model_.Predict(mutated_individuals[i]->ir_schedule.GetModule(), tune_task_.target);
+    }
+  }
+  VLOG(4) << JoinStatesDebugString(
+      "EvolutionarySearch::Evolve: mutated individuals:", mutated_individuals, /*verbose=*/VLOG_IS_ON(5));
+  // select the top ret_num individuals by predicted cost
+  utils::SizedMultiSet<SearchState> evolution_with_cost(ret_num);
+  for (size_t i = 0; i < evolution.size(); ++i) {
+    evolution_with_cost.Push(evolution[i]);
+  }
+  for (size_t i = 0; i < mutated_individuals.size(); ++i) {
+    evolution_with_cost.Push(mutated_individuals[i]);
+  }
+  auto selected_individuals = evolution_with_cost.ReturnAsContainer<std::vector<SearchState>>();
+  VLOG(4) << JoinStatesDebugString(
+      "EvolutionarySearch::Evolve: selected individuals:", selected_individuals, /*verbose=*/VLOG_IS_ON(5));
+
+  return selected_individuals;
+}
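+
+// A quick arithmetic example for the eps-greedy split below (hypothetical
+// numbers): num = 10 and eps_greedy = 0.2 give num_rands = 2 and
+// num_bests = 8, i.e. eight cost-model picks plus two random sketches per
+// generation, before de-duplication against visited_candidates_.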
+
+std::vector<SearchState> EvolutionarySearch::PickNextGenerationEpsGreedy(const std::vector<SearchState>& picked_bests,
+                                                                         const std::vector<SearchState>& random_init,
+                                                                         int num,
+                                                                         float eps_greedy) {
+  int num_rands = num * eps_greedy;
+  int num_bests = num - num_rands;
+
+  std::vector<SearchState> result;
+  SearchState selected;
+  int deduplicated_cnt = 0;
+  int best_idx = 0;
+  int rand_idx = 0;
+  while (result.size() < num) {
+    if (result.size() < num_bests && best_idx < picked_bests.size()) {
+      selected = picked_bests[best_idx];
+      ++best_idx;
+    } else if (rand_idx < random_init.size()) {
+      selected = random_init[rand_idx];
+      ++rand_idx;
+    } else if (best_idx < picked_bests.size()) {
+      selected = picked_bests[best_idx];
+      ++best_idx;
+    } else {
+      break;
+    }
+
+    if (!visited_candidates_.count(selected)) {  // deduplicate
+      VLOG(4) << JoinStatesDebugString(
+          "EvolutionarySearch::PickNextGenerationEpsGreedy-Selected", {selected}, /*verbose=*/VLOG_IS_ON(5));
+      visited_candidates_.insert(selected);
+      result.push_back(selected);
+    } else {
+      ++deduplicated_cnt;
+      VLOG(4) << JoinStatesDebugString(
+          "EvolutionarySearch::PickNextGenerationEpsGreedy-Deduplicated", {selected}, /*verbose=*/VLOG_IS_ON(5));
+    }
+  }
+
+  VLOG(4) << utils::StringFormat(
+      "PickNextGenerationEpsGreedy: picked_bests size=%lu,random_init size=%lu,num=%d,"
+      "eps_greedy=%f,deduplicated_cnt=%d,result size=%lu",
+      picked_bests.size(),
+      random_init.size(),
+      num,
+      eps_greedy,
+      deduplicated_cnt,
+      result.size());
+  return result;
+}
+
+}  // namespace auto_schedule
+}  // namespace cinn
diff --git a/paddle/cinn/auto_schedule/search_strategy/evolutionary_search.h b/paddle/cinn/auto_schedule/search_strategy/evolutionary_search.h
new file mode 100644
index 0000000000000..40e5bb9f7e889
--- /dev/null
+++ b/paddle/cinn/auto_schedule/search_strategy/evolutionary_search.h
@@ -0,0 +1,146 @@
+// Copyright (c) 2022 CINN Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <memory>
+#include <vector>
+
+#include "cinn/auto_schedule/cost_model/expr_cost_model.h"
+#include "cinn/auto_schedule/database/database.h"
+#include "cinn/auto_schedule/post_schedule_rule/post_schedule_rule.h"
+#include "cinn/auto_schedule/search_space/search_space.h"
+#include "cinn/auto_schedule/search_space/search_state.h"
+#include "cinn/auto_schedule/search_strategy/mutate_rule/mutate_rule.h"
+#include "cinn/auto_schedule/task/tune_task.h"
+#include "cinn/auto_schedule/tuning.h"
+#include "cinn/ir/ir_schedule.h"
+
+namespace cinn {
+namespace auto_schedule {
+
+/**
+ * Class implementing evolutionary search on the ModuleExpr search space.
+ */
+class EvolutionarySearch {
+ public:
+  /**
+   * constructor with TuneTask.
+   *
+   * @param tune_task: the TuneTask this class works on. This class doesn't
+   *     take ownership of the object.
+   */
+  EvolutionarySearch(const TuneTask& tune_task,
+                     const ExprCostModel& cost_model,
+                     Database* database,
+                     utils::LinearRandomEngine::StateType rand_seed = -1,
+                     const std::vector<std::tuple<std::string, double>>& mutate_rules = {});
+
+  /**
+   * Destructor
+   */
+  ~EvolutionarySearch();
+
+  /**
+   * Run the evolutionary search for one iteration.
+   *
+   * @return SearchState containing the best ir::ModuleExpr searched in this iteration
+   */
+  SearchState SearchModuleExpr(const TuningOptions& options);
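+
+  // A hypothetical call site (cf. evolutionary_search_test.cc):
+  //   EvolutionarySearch searcher(task, cost_model, &database);
+  //   SearchState best = searcher.SearchModuleExpr(options);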
+
+  /**
+   * Run the evolutionary search for one iteration.
+   *
+   * @return SearchState(s) containing the best ir::ModuleExpr(s) searched in this iteration
+   */
+  std::vector<SearchState> SearchModuleExprBests(const TuningOptions& options);
+
+  /**
+   * Run the evolutionary search for one iteration, but since evolutionary
+   * search with a cost model may not be accurate, this method picks
+   * "eps * total_return_size" random samples along with the best
+   * ir::ModuleExpr's searched in this iteration.
+   *
+   * @return SearchStates containing the best ir::ModuleExpr's searched
+   *     in this iteration and some random samples. There are
+   *     "eps * total_return_size" random samples and
+   *     "(1 - eps) * total_return_size" best searched samples.
+   */
+  std::vector<SearchState> SearchModuleExprEpsGreedy(const TuningOptions& options);
+
+#ifdef CINN_WITH_TEST
+  /**
+   * Method only called during testing. It is used to set a mock search
+   * space.
+   *
+   * @param search_space: the mock search space; note that EvolutionarySearch
+   *     takes the ownership.
+   */
+  void SetSearchSpace(SearchSpace* search_space) { search_space_.reset(search_space); }
+
+  // Method only called during testing; it is a wrapper of the private method InitSketch().
+  std::vector<SearchState> TestInitSketch(int num, const std::string& strategy) { return InitSketch(num, strategy); }
+
+  // Method only called during testing; it is a wrapper of the private method Evolve().
+  std::vector<SearchState> TestEvolve(const std::vector<SearchState>& population, int cross_over_num, int ret_num) {
+    return Evolve(population, cross_over_num, ret_num);
+  }
+#endif
+
+ private:
+  std::vector<SearchState> GetTopKCandidatesFromDatabase(int topk);
+
+  /**
+   * \brief Generate sketches as the initial population of evolutionary search.
+   * @param num The number of sketches to generate.
+   * @param strategy The strategy to generate sketches.
+   *        Current optional strategies are "rule_prune", "random_prune" and "random".
+   *        - "rule_prune": uses rules to prune and generates sketches as efficiently as possible.
+   *        - "random_prune": uses the new interface ApplySketchRules() to simulate the random generation of sketches,
+   *          and supports a rule returning multiple SearchStates as well as random pruning by probability.
+   *        - "random": randomly selects a block and a rule to apply and repeats this step several times;
+   *          however, each rule can be used on one SearchState at most once.
+   * @return Generated sketches.
+   */
+  std::vector<SearchState> InitSketch(int num, const std::string& strategy);
+
+  SearchState Mutate(const SearchState& state, utils::LinearRandomEngine::StateType* rand_seed);
+
+  SearchState CrossOver(const SearchState& state1, const SearchState& state2);
+
+  std::vector<SearchState> Evolve(const std::vector<SearchState>& population, int cross_over_num, int ret_num);
+
+  std::vector<SearchState> PickNextGenerationEpsGreedy(const std::vector<SearchState>& population,
+                                                       const std::vector<SearchState>& random_init,
+                                                       int num,
+                                                       float eps_greedy);
+
+ private:
+  std::unique_ptr<SearchSpace> search_space_;
+  const TuneTask& tune_task_;
+  const ExprCostModel& cost_model_;  // not owned
+  Database* database_;               // not owned
+  // used to deduplicate states with the same structural IR
+  std::unordered_set<SearchState, SearchStateHash, SearchStateEqual> visited_candidates_;
+  // mutate rule names and their weights
+  std::vector<std::tuple<std::string, double>> mutators_;
+  // mutate rules, the key is the accumulated weight of each mutate rule
+  std::map<double, std::unique_ptr<MutateRule>> weighted_mutators_;
+  // schedule rules used after mutation
+  std::vector<std::unique_ptr<PostScheduleRule>> post_schedule_rules_;
+  utils::LinearRandomEngine::StateType rand_seed_;
+};
+
+}  // namespace auto_schedule
+}  // namespace cinn
diff --git a/paddle/cinn/auto_schedule/search_strategy/evolutionary_search_test.cc b/paddle/cinn/auto_schedule/search_strategy/evolutionary_search_test.cc
new file mode 100644
index 0000000000000..4f6764b41f65a
--- /dev/null
+++ b/paddle/cinn/auto_schedule/search_strategy/evolutionary_search_test.cc
@@ -0,0 +1,196 @@
+// Copyright (c) 2022 CINN Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "cinn/auto_schedule/search_strategy/evolutionary_search.h"
+
+#include <gtest/gtest.h>
+
+#include <memory>
+#include <vector>
+
+#include "cinn/auto_schedule/cost_model/expr_cost_model.h"
+#include "cinn/auto_schedule/database/database.h"
+#include "cinn/auto_schedule/search_space/search_space.h"
+#include "cinn/auto_schedule/search_space/search_state.h"
+#include "cinn/auto_schedule/task/task_creator.h"
+#include "cinn/auto_schedule/task/task_registry.h"
+#include "cinn/auto_schedule/task/tune_task.h"
+#include "cinn/auto_schedule/tuning.h"
+#include "cinn/ir/ir_base.h"
+#include "cinn/ir/ir_schedule.h"
+#include "tests/program_builder.h"
+
+namespace cinn {
+namespace auto_schedule {
+
+std::vector<TuneTask> CreateTasks(const frontend::Program& program, const Target& target) {
+  auto graph = std::make_shared<hlir::framework::Graph>(program, target);
+  TaskCreator task_creator;
+  auto tasks = task_creator.CreateTuneTaskOpLevel(graph.get());
+  const auto& dtype_dict = graph->GetAttrs<absl::flat_hash_map<std::string, common::Type>>("inferdtype");
+  const auto& shape_dict = graph->GetAttrs<absl::flat_hash_map<std::string, hlir::framework::shape_t>>("infershape");
+  auto op_lowerer = std::make_unique<hlir::framework::OpLowerer>(dtype_dict, shape_dict, target);
+  InitialTaskRegistry* task_registry = InitialTaskRegistry::Global();
+  for (auto i = 0; i < tasks.size(); ++i) {
+    tasks[i].Initialize(shape_dict, dtype_dict, op_lowerer.get());
+    task_registry->Regist(tasks[i].serialized_key, ir::ModuleExpr(tasks[i].GetLoweredFuncBodyExprs()));
+  }
+  return tasks;
+}
+
+/**
+ * A mock search space that is only used for testing. It creates integer ir::Expr from
+ * 0, -1, -2, ...
and set the cost value same as the integer value. + * + * So evolutionary search should be able to find the minimal ModuleExpr with + * smallest ir::Expr. This file tests it. + */ +class MockSearchSpace : public SearchSpace { + public: + MockSearchSpace(const TuneTask& tune_task) : SearchSpace(tune_task) {} + + int GetMinExprValue() const { return min_expr_value_; } + + int GetModuleExprSize() const { return module_expr_size_; } + + std::vector GenerateSketches(int num, const std::string& strategy) override { + std::vector ret; + for (int i = 0; i < num; ++i) { + std::vector exprs; + for (int j = 0; j < module_expr_size_; ++j) { + exprs.push_back(ir::Expr(-i)); + } + min_expr_value_ = -i; + ret.push_back(SearchState(ir::IRSchedule(ir::ModuleExpr(exprs)))); + } + return ret; + } + + private: + int module_expr_size_ = 10; + int min_expr_value_ = 0; +}; + +class MockCostModel : public ExprCostModel { + float Predict(const ir::ModuleExpr& sample, const common::Target& target) const override { + float cost = 0.0f; + std::vector exprs = sample.GetExprs(); + for (const ir::Expr& expr : exprs) { + if (expr.as_int32()) { + cost += static_cast((expr.as_int32())); + } + } + return cost; + } +}; + +TEST(EvolutionarySearch, GetOneBest) { + TuneTask mock_tune_task; + mock_tune_task.serialized_key = "mock_task"; + mock_tune_task.target = common::DefaultTarget(); + InitialTaskRegistry* task_registry = InitialTaskRegistry::Global(); + task_registry->Regist(mock_tune_task.serialized_key, ir::ModuleExpr({ir::Expr(0)})); + MockCostModel cost_model; + TuningOptions options; + Database db(2); + EvolutionarySearch evolutionary_search(mock_tune_task, cost_model, &db); + + MockSearchSpace* mock_search_space = new MockSearchSpace(mock_tune_task); + // Ownership is transferred so don't delete mock_search_space + evolutionary_search.SetSearchSpace(mock_search_space); + SearchState best_state = evolutionary_search.SearchModuleExpr(options); + std::vector exprs = best_state->ir_schedule.GetModule().GetExprs(); + EXPECT_GE(exprs.size(), 1UL); + for (const ir::Expr& e : exprs) { + EXPECT_EQ(e.as_int32(), mock_search_space->GetMinExprValue()); + } +} + +TEST(EvolutionarySearch, GetEpsGreedy) { + TuneTask mock_tune_task; + mock_tune_task.serialized_key = "mock_task"; + mock_tune_task.target = common::DefaultTarget(); + InitialTaskRegistry* task_registry = InitialTaskRegistry::Global(); + task_registry->Regist(mock_tune_task.serialized_key, ir::ModuleExpr({ir::Expr(0)})); + ExprCostModel cost_model; + TuningOptions options; + Database db(2); + EvolutionarySearch evolutionary_search(mock_tune_task, cost_model, &db); + + MockSearchSpace* mock_search_space = new MockSearchSpace(mock_tune_task); + // Ownership is transferred so don't delete mock_search_space + evolutionary_search.SetSearchSpace(mock_search_space); + std::vector search_states = evolutionary_search.SearchModuleExprEpsGreedy(options); + + EXPECT_GE(search_states.size(), 1UL); + size_t expr_size = static_cast(mock_search_space->GetModuleExprSize()); + for (const SearchState& state : search_states) { + EXPECT_EQ(state->ir_schedule.GetModule().GetExprs().size(), expr_size); + } +} + +TEST(EvolutionarySearch, Evolve) { + auto target = common::DefaultNVGPUTarget(); + auto tasks = CreateTasks(tests::OpBuilder("matmul").Build({{"X", {32, 32}}, {"Y", {32, 32}}}), target); + CHECK_EQ(tasks.size(), 1); + ExprCostModel cost_model; + std::vector cost_model_samples(1); + std::vector cost_model_labels(1); + for (size_t i = 0; i < 2; ++i) { + ir::ModuleExpr 
me({ir::Expr(tasks[0].lowered_funcs[0])}); + cost_model_samples[0] = &me; + cost_model_labels[0] = i + 10; + cost_model.Update(cost_model_samples, cost_model_labels, target); + } + + Database db(2); + TuningOptions options; + options.evolution_pick_database_topk = 0; + + EvolutionarySearch evolutionary_search(tasks[0], cost_model, &db); + + int num_population = 10; + std::vector init_sketch = evolutionary_search.TestInitSketch(num_population, "rule_prune"); + for (int i = 0; i < num_population; ++i) { + ir::ModuleExpr me(init_sketch[i]->ir_schedule.GetModule()); + cost_model_samples[0] = &me; + cost_model_labels[0] = i; + cost_model.Update(cost_model_samples, cost_model_labels, target); + } + VLOG(6) << "init sketch costs:"; + for (auto s : init_sketch) { + VLOG(6) << "cost = " << s->predicted_cost; + } + std::vector*population_pre_ptr = &init_sketch, *population_next_ptr; + std::vector population; + for (int i = 0; i < 10; ++i) { + population = evolutionary_search.TestEvolve(*population_pre_ptr, /*cross_over_num*/ 0, /*ret_num*/ 10); + population_next_ptr = &population; + VLOG(6) << "population[" << i + 1 << "] costs:"; + double total_cost_pre = 0.0, total_cost_next = 0.0; + for (auto s : *population_pre_ptr) { + total_cost_pre += s->predicted_cost; + } + for (auto s : *population_next_ptr) { + total_cost_next += s->predicted_cost; + VLOG(6) << "cost = " << s->predicted_cost; + } + VLOG(6) << "total_cost_next = " << total_cost_next; + CHECK_LE(total_cost_next, total_cost_pre); + std::swap(population_pre_ptr, population_next_ptr); + } +} + +} // namespace auto_schedule +} // namespace cinn diff --git a/paddle/cinn/auto_schedule/search_strategy/mutate_rule/CMakeLists.txt b/paddle/cinn/auto_schedule/search_strategy/mutate_rule/CMakeLists.txt new file mode 100644 index 0000000000000..308f9a91feea5 --- /dev/null +++ b/paddle/cinn/auto_schedule/search_strategy/mutate_rule/CMakeLists.txt @@ -0,0 +1,8 @@ +core_gather_headers() + +gather_srcs(cinnapi_src SRCS + mutate_rule.cc + mutate_tile_size.cc + ) + +cc_test(test_mutate_tile_size SRCS mutate_tile_size_test.cc DEPS cinncore) diff --git a/paddle/cinn/auto_schedule/search_strategy/mutate_rule/mutate_rule.cc b/paddle/cinn/auto_schedule/search_strategy/mutate_rule/mutate_rule.cc new file mode 100644 index 0000000000000..8e07e0d572788 --- /dev/null +++ b/paddle/cinn/auto_schedule/search_strategy/mutate_rule/mutate_rule.cc @@ -0,0 +1,32 @@ +// Copyright (c) 2023 CINN Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
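The factory defined below is how mutation rules are obtained by name. As a caller-side orientation, a hedged sketch (illustrative only; `trace` and `rand_seed` are assumed to come from the surrounding tuning context):

    std::unique_ptr<MutateRule> rule = MutateRule::Make("mutate_tile_size");
    ir::ScheduleDesc mutated = rule->Apply(trace, &rand_seed);  // returns the original trace when mutation fails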
+ +#include "cinn/auto_schedule/search_strategy/mutate_rule/mutate_rule.h" + +#include "cinn/auto_schedule/search_strategy/mutate_rule/mutate_tile_size.h" + +namespace cinn { +namespace auto_schedule { + +std::unique_ptr MutateRule::Make(const std::string& name) { + if (name == "mutate_tile_size") { + return std::make_unique(); + } else { + LOG(FATAL) << "MutateRule " << name << " is not supported."; + } + return nullptr; +} + +} // namespace auto_schedule +} // namespace cinn diff --git a/paddle/cinn/auto_schedule/search_strategy/mutate_rule/mutate_rule.h b/paddle/cinn/auto_schedule/search_strategy/mutate_rule/mutate_rule.h new file mode 100644 index 0000000000000..b650a9c746763 --- /dev/null +++ b/paddle/cinn/auto_schedule/search_strategy/mutate_rule/mutate_rule.h @@ -0,0 +1,48 @@ +// Copyright (c) 2023 CINN Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "cinn/ir/schedule_desc.h" +#include "cinn/utils/random_engine.h" + +namespace cinn { +namespace auto_schedule { + +/** + * Base class for rules of mutate, + * is used for mutating the trace(ScheduleDesc) to explore the search space. + */ +class MutateRule { + public: + MutateRule() = default; + + /** + * @brief Apply the mutate rule to the given trace. + * @param trace The given trace for mutation. + * @param rand_seed The random seed for mutation. + * @return The mutated trace. + */ + virtual ir::ScheduleDesc Apply(const ir::ScheduleDesc& trace, utils::LinearRandomEngine::StateType* rand_seed) = 0; + + /** + * @brief Create a MutateRule with name. + * @param name The name of mutate rule, consisting of lowercase letters and underscores + * @return The created MutateRule. + */ + static std::unique_ptr Make(const std::string& name); +}; + +} // namespace auto_schedule +} // namespace cinn diff --git a/paddle/cinn/auto_schedule/search_strategy/mutate_rule/mutate_tile_size.cc b/paddle/cinn/auto_schedule/search_strategy/mutate_rule/mutate_tile_size.cc new file mode 100644 index 0000000000000..bc59bf668198d --- /dev/null +++ b/paddle/cinn/auto_schedule/search_strategy/mutate_rule/mutate_tile_size.cc @@ -0,0 +1,142 @@ +// Copyright (c) 2023 CINN Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "cinn/auto_schedule/search_strategy/mutate_rule/mutate_tile_size.h" + +namespace cinn { +namespace auto_schedule { + +using ::cinn::ir::ScheduleDesc; +using ::cinn::utils::LinearRandomEngine; + +using SampledTile = std::tuple, int>; + +static std::vector Factorize(int n) { + std::vector res; + for (int i = 1; i * i <= n; ++i) { + if (n % i == 0) { + res.push_back(i); + if (i * i != n) { + res.push_back(n / i); + } + } + } + std::sort(res.begin(), res.end()); + return res; +} + +std::vector FindSampledTiles(const ScheduleDesc& trace) { + std::vector tiles; + int step_idx = 0; + for (auto&& step : trace.Steps()) { + if (step.type == "TagPostSchedule") { + break; + } + if (step.type == "SamplePerfectTile") { + std::vector tile_factors = absl::get>(step.attrs.at("decision")); + CHECK(tile_factors.size() >= 2) << "factors size must be greater equal than 2, which is " << tile_factors.size(); + tiles.push_back(std::make_tuple(step, tile_factors, step_idx)); + } + ++step_idx; + } + + return tiles; +} + +ScheduleDesc DoMutateTileSize(const ScheduleDesc& trace, + const SampledTile& tile, + LinearRandomEngine::StateType* rand_seed) { + ScheduleDesc::Step step = std::get<0>(tile); + std::vector tile_factors = std::get<1>(tile); + int split_size = tile_factors.size(); + // Step 1. Choose 2 loops with index: 'loop_x' and 'loop_y' + int loop_x, loop_y; + + bool all_one_factors = true; + for (int t : tile_factors) { + if (t != 1) { + all_one_factors = false; + break; + } + } + if (all_one_factors) { + VLOG(6) << "Factors are all 1, unable to mutate, return the original trace"; + return trace; + } + + while (true) { + VLOG(6) << "while (true) loop in DoMutateTileSize"; + loop_x = utils::SampleUniformInt(0, split_size, rand_seed); + if (tile_factors.at(loop_x) <= 1) { + continue; + } + loop_y = utils::SampleUniformInt(0, split_size - 1, rand_seed); + if (loop_y >= loop_x) { + ++loop_y; + } + std::vector optional_factors = Factorize(tile_factors.at(loop_x)); + // Step 2. Choose the divisor for mutate. + int divisor; + if (loop_y == split_size - 1) { + int max_innermost_factor = absl::get(step.attrs.at("max_innermost_factor")); + int max_optional_factor_idx = optional_factors.size() - 1; + for (; max_optional_factor_idx > 0; --max_optional_factor_idx) { + if (optional_factors.at(max_optional_factor_idx) * tile_factors.at(loop_y) <= max_innermost_factor) { + break; + } + } + if (max_optional_factor_idx == 0) { + if (split_size <= 2) { + VLOG(6) << "Unable to mutate, return the original trace"; + return trace; + } + continue; + } + divisor = optional_factors.at(utils::SampleUniformInt(1, max_optional_factor_idx + 1, rand_seed)); + } else { + divisor = optional_factors.at(utils::SampleUniformInt(1, optional_factors.size(), rand_seed)); + } + // Step 3. Determine the new tile value + VLOG(6) << "DoMutateTileSize: divisor = " << divisor << ", before mutate: \n" + << "factors[" << loop_x << "] = " << tile_factors[loop_x] << ", factors[" << loop_y + << "] = " << tile_factors[loop_y]; + tile_factors[loop_x] /= divisor; + tile_factors[loop_y] *= divisor; + VLOG(6) << "after mutate: \n" + << "factors[" << loop_x << "] = " << tile_factors[loop_x] << ", factors[" << loop_y + << "] = " << tile_factors[loop_y]; + // Step 4. 
Create a new step with new tile values and return the new trace + int step_idx = std::get<2>(tile); + return trace.ForkAndUpdate(step_idx, tile_factors, true); + } +} + +ScheduleDesc MutateTileSize::Apply(const ScheduleDesc& trace, LinearRandomEngine::StateType* rand_seed) { + VLOG(6) << "Start applying MutateTileSize, old trace: \n" << trace.DebugString(); + std::vector sample_tile_steps; + std::vector> sample_tile_data; + + auto sampled_tiles = FindSampledTiles(trace); + if (sampled_tiles.size() == 0) { + VLOG(6) << "MutateTileSize failed, try other mutate rules."; + return trace; + } + int sample_step_idx = utils::SampleUniformInt(0, sampled_tiles.size(), rand_seed); + auto new_trace = DoMutateTileSize(trace, sampled_tiles.at(sample_step_idx), rand_seed); + VLOG(6) << "End applying MutateTileSize, new trace: \n" << new_trace.DebugString(); + return new_trace; +} + +} // namespace auto_schedule +} // namespace cinn diff --git a/paddle/cinn/auto_schedule/search_strategy/mutate_rule/mutate_tile_size.h b/paddle/cinn/auto_schedule/search_strategy/mutate_rule/mutate_tile_size.h new file mode 100644 index 0000000000000..2313a38577c38 --- /dev/null +++ b/paddle/cinn/auto_schedule/search_strategy/mutate_rule/mutate_tile_size.h @@ -0,0 +1,33 @@ +// Copyright (c) 2023 CINN Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "cinn/auto_schedule/search_strategy/mutate_rule/mutate_rule.h" + +namespace cinn { +namespace auto_schedule { + +/** + * The rule to mutate tile size, witch will modify the factors of the Split primitive. + */ +class MutateTileSize : public MutateRule { + public: + MutateTileSize() = default; + + ir::ScheduleDesc Apply(const ir::ScheduleDesc& trace, utils::LinearRandomEngine::StateType* rand_seed) override; +}; + +} // namespace auto_schedule +} // namespace cinn diff --git a/paddle/cinn/auto_schedule/search_strategy/mutate_rule/mutate_tile_size_test.cc b/paddle/cinn/auto_schedule/search_strategy/mutate_rule/mutate_tile_size_test.cc new file mode 100644 index 0000000000000..c8b4ce0a27ae6 --- /dev/null +++ b/paddle/cinn/auto_schedule/search_strategy/mutate_rule/mutate_tile_size_test.cc @@ -0,0 +1,126 @@ +// Copyright (c) 2023 CINN Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
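The test below exercises the rule end-to-end; the core pattern, mirroring the test body, is to mutate the recorded trace and replay it onto a fresh schedule (sketch; names follow the test):

    MutateTileSize mutator;
    ir::ScheduleDesc mutated = mutator.Apply(ir_schedule.GetTraceDesc(), &rand_seed);
    mutated.Replay(&new_ir_schedule, true);  // re-applies the mutated steps to new_ir_schedule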
+ +#include "cinn/auto_schedule/search_strategy/mutate_rule/mutate_tile_size.h" + +#include +#include + +#include "cinn/cinn.h" +#include "cinn/ir/ir_schedule.h" + +namespace cinn { +namespace auto_schedule { + +TEST(MutateTileSize, Basic) { + srand(0); + Context::Global().ResetNameId(); +#ifdef CINN_WITH_CUDA + Target target = common::DefaultNVGPUTarget(); +#else + Target target = common::DefaultHostTarget(); +#endif + + const int kSize = 32; + Expr M(kSize); + Expr N(kSize); + Expr K(kSize); + + Placeholder A("A", {M, K}); + Placeholder B("B", {K, N}); + + Var k(K.as_int32(), "reduce_axis_k"); + ir::Tensor C = Compute( + {M, N}, [&](Var i, Var j) { return ReduceSum(A(i, k) * B(k, j), {k}); }, "C"); + + poly::StageMap stages = CreateStages({A, B, C}); + std::vector funcs = + lang::LowerVec("TestMutateTileSize_Basic", stages, {A, B, C}, {}, {}, nullptr, target, true); + + ir::Expr ast_expr = funcs[0]->body; + VLOG(6) << "Original Expr: "; + VLOG(6) << ast_expr; + ir::ModuleExpr module_expr({ast_expr}); + // We need to fix the seed as a constant to ensure that the result can be repeated. + utils::LinearRandomEngine::StateType rand_seed = 123; + ir::IRSchedule ir_schedule(module_expr, rand_seed); + ir::IRSchedule new_ir_schedule(ir_schedule); + + // apply schedule + auto loops = ir_schedule.GetLoops("C"); + auto factors = ir_schedule.SamplePerfectTile(loops[0], 2, kSize); + auto splited = ir_schedule.Split(loops[0], factors); + + // apply mutate + MutateTileSize mutator; + ir::ScheduleDesc sch_desc = mutator.Apply(ir_schedule.GetTraceDesc(), &rand_seed); + sch_desc.Replay(&new_ir_schedule, true); + VLOG(6) << "Expr before mutate tile size: \n" << ir_schedule.GetModule().GetExprs()[0]; + VLOG(6) << "Expr after mutate tile size: \n" << new_ir_schedule.GetModule().GetExprs()[0]; + + std::string target_new_ir = R"ROC({ + ScheduleBlock(root) + { + serial for (i_1, 0, 2) + { + serial for (i_2, 0, 16) + { + serial for (j, 0, 32) + { + ScheduleBlock(C__reduce_init) + { + i0, i1 = axis.bind(((16 * i_1) + i_2), j) + C__reduce_init[i0, i1] = 0.00000000f + } + serial for (reduce_axis_k, 0, 32) + { + ScheduleBlock(C) + { + i0_0, i1_0, i2 = axis.bind(((16 * i_1) + i_2), j, reduce_axis_k) + C[i0_0, i1_0] = (C[i0_0, i1_0] + (A[i0_0, i2] * B[i2, i1_0])) + } + } + } + } + } + } +})ROC"; + + auto get_ir_str = [](const ir::IRSchedule* ir_sch) -> std::string { + std::vector exprs = ir_sch->GetModule().GetExprs(); + EXPECT_EQ(exprs.size(), 1UL); + std::stringstream ss; + ss << exprs[0]; + return ss.str(); + }; + ASSERT_EQ(get_ir_str(&new_ir_schedule), target_new_ir); + + std::vector last_tile_factors = {2, 16}; + for (int i = 0; i < 10; ++i) { + sch_desc = mutator.Apply(sch_desc, &rand_seed); + for (auto&& step : sch_desc.Steps()) { + if (step.type == "SamplePerfectTile") { + std::vector tile_factors = absl::get>(step.attrs.at("decision")); + ASSERT_EQ(tile_factors.size(), last_tile_factors.size()); + ASSERT_NE(tile_factors[0], last_tile_factors[0]); + ASSERT_NE(tile_factors[1], last_tile_factors[1]); + ASSERT_EQ(tile_factors[0] * tile_factors[1], kSize); + last_tile_factors = tile_factors; + } + } + } +} + +} // namespace auto_schedule +} // namespace cinn diff --git a/paddle/cinn/auto_schedule/task/CMakeLists.txt b/paddle/cinn/auto_schedule/task/CMakeLists.txt new file mode 100644 index 0000000000000..f3dc34dad4c86 --- /dev/null +++ b/paddle/cinn/auto_schedule/task/CMakeLists.txt @@ -0,0 +1,12 @@ +core_gather_headers() + +gather_srcs(cinnapi_src SRCS + task_creator.cc + task_optimizer.cc + tune_task.cc + ) 
+gather_srcs(cinnapi_src SRCS task_creator.cc task_optimizer.cc)
+
+cc_test(test_task_creator SRCS task_creator_test.cc DEPS cinncore)
+cc_test(test_tune_task SRCS tune_task_test.cc DEPS cinncore)
+cc_test(test_task_registry SRCS task_registry_test.cc DEPS cinncore)
diff --git a/paddle/cinn/auto_schedule/task/task_creator.cc b/paddle/cinn/auto_schedule/task/task_creator.cc
new file mode 100644
index 0000000000000..6d62ec2a7278d
--- /dev/null
+++ b/paddle/cinn/auto_schedule/task/task_creator.cc
@@ -0,0 +1,57 @@
+// Copyright (c) 2022 CINN Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "cinn/auto_schedule/task/task_creator.h"
+
+#include
+
+#include
+#include
+#include
+
+#include "cinn/hlir/framework/graph.h"
+#include "cinn/hlir/framework/node.h"
+#include "cinn/hlir/framework/pass.h"
+
+namespace cinn {
+namespace auto_schedule {
+
+using ::cinn::common::GraphEdge;
+using ::cinn::common::GraphNode;
+using ::cinn::hlir::framework::Graph;
+using ::cinn::hlir::framework::Node;
+using ::cinn::hlir::framework::NodeData;
+
+std::vector<TuneTask> TaskCreator::CreateTuneTaskOpLevel(Graph* graph) {
+  std::vector<TuneTask> ret_tasks;
+
+  const std::vector<std::shared_ptr<Graph::Group>>* groups = &graph->fusion_groups;
+  std::vector<std::shared_ptr<Graph::Group>> non_fused_groups;
+  // The input graph hasn't gone through op fusion yet
+  if (graph->fusion_groups.empty()) {
+    hlir::framework::ApplyPasses(graph, {"BuildNonFusedGroupsPass"});
+    groups = &graph->fusion_groups;
+  }
+  VLOG(3) << "Graph groups size:" << groups->size();
+
+  for (const auto& sub_graph : *groups) {
+    ret_tasks.emplace_back(TuneTask());
+    ret_tasks.back().subgraph = sub_graph;
+    ret_tasks.back().target = graph->target_;
+  }
+  return ret_tasks;
+}
+
+}  // namespace auto_schedule
+}  // namespace cinn
diff --git a/paddle/cinn/auto_schedule/task/task_creator.h b/paddle/cinn/auto_schedule/task/task_creator.h
new file mode 100644
index 0000000000000..6dd600f54e340
--- /dev/null
+++ b/paddle/cinn/auto_schedule/task/task_creator.h
@@ -0,0 +1,36 @@
+// Copyright (c) 2022 CINN Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include
+#include
+
+#include "cinn/auto_schedule/task/tune_task.h"
+#include "cinn/common/target.h"
+#include "cinn/hlir/framework/graph.h"
+
+namespace cinn {
+namespace auto_schedule {
+
+/**
+ * Class to create auto-tuning tasks.
+ */ +class TaskCreator { + public: + std::vector CreateTuneTaskOpLevel(hlir::framework::Graph* graph); +}; + +} // namespace auto_schedule +} // namespace cinn diff --git a/paddle/cinn/auto_schedule/task/task_creator_test.cc b/paddle/cinn/auto_schedule/task/task_creator_test.cc new file mode 100644 index 0000000000000..fe5638108e884 --- /dev/null +++ b/paddle/cinn/auto_schedule/task/task_creator_test.cc @@ -0,0 +1,72 @@ +// Copyright (c) 2022 CINN Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "cinn/auto_schedule/task/task_creator.h" + +#include + +#include +#include + +#include "cinn/common/target.h" +#include "cinn/frontend/net_builder.h" +#include "cinn/frontend/syntax.h" +#include "cinn/hlir/framework/graph.h" +#include "cinn/hlir/framework/graph_compiler.h" +#include "cinn/hlir/framework/node.h" + +namespace cinn { +namespace auto_schedule { + +using ::cinn::frontend::NetBuilder; +using ::cinn::frontend::Program; +using ::cinn::hlir::framework::Graph; +using ::cinn::hlir::framework::Node; + +Program CreateAddProgram() { + constexpr int M = 32; + constexpr int N = 24; + + NetBuilder builder("net_builder"); + auto a = builder.CreateInput(Float(32), {M, N}, "A"); + auto b = builder.CreateInput(Float(32), {M, N}, "B"); + auto c = builder.Add(a, b); + auto d = builder.Add(a, c); + auto program = builder.Build(); + + return program; +} + +TEST(TaskCreator, Basic) { +#ifdef CINN_WITH_CUDA + Target target = common::DefaultNVGPUTarget(); +#else + Target target = common::DefaultHostTarget(); +#endif + Program prog = CreateAddProgram(); + auto graph = std::make_shared(prog, target); + + TaskCreator task_creator; + std::vector tasks = task_creator.CreateTuneTaskOpLevel(graph.get()); + + ASSERT_EQ(tasks.size(), 2UL); + for (TuneTask& task : tasks) { + std::shared_ptr subgraph = task.subgraph; + ASSERT_EQ(subgraph->CollectNodes().size(), 1UL); + ASSERT_EQ(subgraph->nodes[0]->op()->name, "elementwise_add"); + } +} + +} // namespace auto_schedule +} // namespace cinn diff --git a/paddle/cinn/auto_schedule/task/task_optimizer.cc b/paddle/cinn/auto_schedule/task/task_optimizer.cc new file mode 100644 index 0000000000000..b4afd2fa0bd4b --- /dev/null +++ b/paddle/cinn/auto_schedule/task/task_optimizer.cc @@ -0,0 +1,407 @@ +// Copyright (c) 2022 CINN Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
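One orientation note for Optimize() in this file: each strategy contributes a Result with a default cost, and the cheapest candidate wins, so the defaults encode a priority order until real measurements exist. A self-contained sketch of that ordering (editorial; uses plain pairs rather than the Result struct):

    #include <algorithm>
    #include <limits>
    #include <string>
    #include <utility>
    #include <vector>

    std::vector<std::pair<std::string, double>> candidates = {
        {"Evolution", std::numeric_limits<double>::max()},  // until predicted or measured
        {"Manual", 0.0},                                    // second best by default
        {"External", -1.0}};                                // best by default
    std::sort(candidates.begin(), candidates.end(),
              [](const auto& l, const auto& r) { return l.second < r.second; });
    // candidates.front().first == "External" when nothing has been measured yet.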
+ +#include "cinn/auto_schedule/task/task_optimizer.h" + +#include + +#include +#include + +#include "cinn/auto_schedule/analysis/analyze_ir.h" +#include "cinn/auto_schedule/cost_model/expr_cost_model.h" +#include "cinn/auto_schedule/measure/measure.h" +#include "cinn/auto_schedule/search_strategy/evolutionary_search.h" +#include "cinn/common/target.h" +#include "cinn/hlir/framework/op_lowering.h" +#include "cinn/hlir/op/external_api_registry.h" +#include "cinn/ir/buffer.h" +#include "cinn/ir/ir.h" +#include "cinn/ir/ir_base.h" +#include "cinn/ir/ir_schedule.h" +#include "cinn/optim/ir_copy.h" +#include "cinn/optim/transform_gpu_forloop.h" +#include "cinn/runtime/flags.h" +#include "cinn/utils/string.h" +#ifdef CINN_WITH_CUDA +#include + +#include "cinn/backends/cuda_util.h" +#endif + +DECLARE_bool(auto_schedule_use_cost_model); + +namespace cinn { +namespace auto_schedule { + +using cinn::hlir::op::ExternalApiRegistry; + +// *** forward declarations of auxiliary functions to be used in this file only *** +// update a scheduled function with several post-processors +ir::LoweredFunc FuncWithUpdatedBody(const common::Target& target, const ir::LoweredFunc& old_func, ir::Expr& body); +// check whether a scheduled lowered function is valid +bool PruneInvalid(const ir::LoweredFunc& lowered_func, const common::Target& target); +// exclude some special tasks +bool IsForbiddenToTune(const TuneTask* task); +// tell whether the task has been wrapped by custom_call in TransToCustomCallPass +bool IsWrappedByCustomCall(const TuneTask* task); +// tell whether the task has registered external api +bool HasExternalApi(const TuneTask* task); + +TaskOptimizer::TaskOptimizer(TuneTask* task, + ScheduleMeasurer* schedule_measurer, + Database* database, + utils::LinearRandomEngine::StateType rand_seed) + : task_(task), + schedule_measurer_(schedule_measurer), + database_(database), + cost_model_(), + rand_seed_(utils::LinearRandomEngine::NormalizeState(rand_seed)) {} + +FunctionGroup TaskOptimizer::Optimize(const TuningOptions& options) { + CHECK(task_->subgraph != nullptr) << "subgraph can't be empty"; + // task with forbidden or custom_call ops can't be tuned + if (IsForbiddenToTune(task_) || IsWrappedByCustomCall(task_)) { + return task_->op_lowerer->Lower(task_->subgraph); + } + // TODO(CtfGo): the input/output names of a Graph::Group will be changed in Lowering by OpLowerer currently, + // so we should revert them after following different lower methods, remove this hard code by fixing the + // decoupling between lowering and BuildInstructions + auto initial_input_names = task_->subgraph->input_names; + auto initial_output_names = task_->subgraph->output_names; + + std::vector candidates; + candidates.emplace_back(OptimizeByEvolution(options)); + candidates.emplace_back(OptimizeByManual(options.num_measure_trials > 0)); + if (HasExternalApi(task_)) { + candidates.emplace_back(OptimizeByExternal(options.num_measure_trials > 0)); + } + sort(candidates.begin(), candidates.end(), [](const auto& lhs, const auto& rhs) { return lhs.cost < rhs.cost; }); + auto&& best = candidates.front(); + VLOG(4) << "Total candidates=" << candidates.size() << ", the best from=" << best.from << ", cost=" << best.cost; + + // revert input/output names + task_->subgraph->input_names = initial_input_names; + task_->subgraph->output_names = initial_output_names; + return best.functions; +} + +TaskOptimizer::Result TaskOptimizer::OptimizeByManual(bool need_measured) { + static constexpr char* kManualMeasuredKeyPrefix = 
"@ManualMeasured:\n"; + TaskOptimizer::Result result("Manual"); + result.functions = task_->op_lowerer->Lower(task_->subgraph); + + // pack functions body + std::vector func_bodys; + for (const ir::LoweredFunc& func : result.functions) { + func_bodys.push_back(func->body); + } + + SearchState state(ir::IRSchedule(ir::ModuleExpr(std::move(func_bodys)))); + // the manual is regarded as the second best in default, so we set its cost 0.0 + result.cost = 0.0; + + // add the specific prefix in front of serialized_key to be store/load measured record for manual schedule + std::string measured_key = kManualMeasuredKeyPrefix + task_->serialized_key; + if (need_measured && database_->Count(measured_key) == 0) { + std::vector inputs(1); + inputs.back().task = task_; + inputs.back().lowered_funcs = result.functions; + VLOG(4) << "Measure manual schedule"; + std::vector measure_outputs = schedule_measurer_->Measure(inputs); + database_->AddRecord(TuningRecord(measured_key, state, measure_outputs[0].execution_cost)); + } + + auto measured_records = database_->LookUp(measured_key); + if (!measured_records.empty()) { // update result.cost by measured if exists + result.cost = measured_records[0].execution_cost; + } + return result; +} + +TaskOptimizer::Result TaskOptimizer::OptimizeByExternal(bool need_measured) { + static constexpr char* kExternalMeasuredKeyPrefix = "@ExternalMeasured:\n"; + TaskOptimizer::Result result("External"); + auto nodes = task_->subgraph->CollectNodes(); + auto* first_node = nodes.front(); + + // set the necessary field for lowering with external api + std::string original_op = first_node->op()->name; + first_node->attrs.attr_store["original_op"] = original_op; + first_node->attrs.op = hlir::framework::Operator::Get("custom_call"); + result.functions = task_->op_lowerer->Lower(task_->subgraph); + + // add the specific prefix in front of serialized_key to be store/load measured record for external api + result.cost = -1.0; // the external is regarded as the best in default, so we set its cost -1.0 + std::string measured_key = kExternalMeasuredKeyPrefix + task_->serialized_key; + if (need_measured && database_->Count(measured_key) == 0) { + std::vector inputs(1); + inputs.back().task = task_; + inputs.back().lowered_funcs = result.functions; + VLOG(4) << "Measure external api"; + std::vector measure_outputs = schedule_measurer_->Measure(inputs); + // the SearchState of external is invalid and will not be used, so we just put a temporary one + database_->AddRecord(TuningRecord(measured_key, SearchState(ir::IRSchedule()), measure_outputs[0].execution_cost)); + } + + auto measured_records = database_->LookUp(measured_key); + if (!measured_records.empty()) { // update result.cost by measured if exists + result.cost = measured_records[0].execution_cost; + } + return result; +} + +bool IsForbiddenToTune(const TuneTask* task) { + // TODO(CtfGo): some operators may change its linked edges in + // TransToCustomCallPass, like conv2d, we will skip these ops in auto-schedule + // because they can't revert original links for no schedule and manual schedule lowering. 
+ static std::unordered_set links_changed_ops = {"conv2d"}; + auto nodes = task->subgraph->CollectNodes(); + auto&& op_name = nodes.front()->op()->name; + if (nodes.size() == 1 && links_changed_ops.count(op_name)) { + VLOG(5) << "Op:" << op_name << " is forbidden to call external_api"; + return true; + } + + return false; +} + +bool HasExternalApi(const TuneTask* task) { + auto nodes = task->subgraph->CollectNodes(); + auto* first_node = nodes.front(); + if (nodes.size() == 1 && ExternalApiRegistry::Global()->Has(first_node->op()->name, task->target)) { + return true; + } + return false; +} + +bool IsWrappedByCustomCall(const TuneTask* task) { + auto nodes = task->subgraph->CollectNodes(); + auto* first_node = nodes.front(); + if (nodes.size() == 1 && first_node->op()->name == "custom_call") { + CHECK(first_node->attrs.attr_store.count("original_op")) << "a custom_call op must store its original op name"; + std::string op_name = absl::get(first_node->attrs.attr_store.at("original_op")); + VLOG(5) << "Op:" << op_name << " was wrapped as custom_call"; + return true; + } + + return false; +} + +TaskOptimizer::Result TaskOptimizer::OptimizeByEvolution(const TuningOptions& options) { + CHECK_EQ(options.num_measure_trials % options.num_samples_per_iteration, 0) + << "TuningOptions.num_measure_trials % TuningOptions.num_samples_per_iteration must be 0."; + + VLOG(4) << "Optimizing TuneTask with num_measure_trials:" << options.num_measure_trials + << ", LoweredFunc before optimization is:"; + VLOG(4) << "lowered function size = " << task_->lowered_funcs.size(); + for (size_t i = 0; i < task_->lowered_funcs.size(); ++i) { + VLOG(4) << "lowered_funcs[" << i << "] detail:\n" << task_->lowered_funcs[i]; + } + + if (evolutionary_search_ == nullptr) { + // TODO(zhhsplendid): check whether the options is same as previous, + // if not, we should create new EvolutionarySearch + evolutionary_search_ = + std::make_unique(*task_, cost_model_, database_, utils::ForkRandomState(&rand_seed_)); + } + + TaskOptimizer::Result result("Evolution"); + auto& optimized_funcs = result.functions; + auto& best_cost = result.cost; + // use initial lowered function as default result + optimized_funcs = optim::IRCopy(task_->lowered_funcs); + if (options.num_measure_trials == 0) { // no need to measure and simply return the best searched + std::vector measure_candidates; + std::vector states = SearchOneRound(options, &measure_candidates); + if (!states.empty()) { + if (FLAGS_auto_schedule_use_cost_model) { + best_cost = cost_model_.Predict(states.front()->ir_schedule.GetModule(), task_->target); + } + optimized_funcs = measure_candidates[0].lowered_funcs; + } else { + LOG(WARNING) << "No valid candidate searched, will return initial state"; + } + return result; + } + + int measured_count = 0; + uint32_t continuous_empty_cnt = 0; + while (measured_count < options.num_measure_trials) { + VLOG(4) << "Launch a new search, current measured_count:" << measured_count; + std::vector measure_inputs; + std::vector states = SearchOneRound(options, &measure_inputs); + if (states.empty()) { // no new valid candidate achieved + ++continuous_empty_cnt; + if (continuous_empty_cnt <= kMaxRetryContinuousEmpty_) { + VLOG(4) << "No valid state searched, continuous_empty_cnt=" << continuous_empty_cnt; + continue; + } else { + LOG(WARNING) + << "OptimizeByEvolution will be exited in advance due to continuous invalid search, final measured_count=" + << measured_count; + break; + } + } + continuous_empty_cnt = 0; // reset if get valid candidates + + 
VLOG(4) << "ScheduleMeasurer start with input size=" << measure_inputs.size(); + std::vector measure_outputs = schedule_measurer_->Measure(measure_inputs); + CHECK_EQ(measure_outputs.size(), states.size()) + << "ScheduleMeasurer didn't output same number of MeasureOutput of states in TaskOptimizer"; + // record to database + for (size_t i = 0; i < states.size(); ++i) { + database_->AddRecord( + TuningRecord(measure_inputs[i].task->serialized_key, states[i], measure_outputs[i].execution_cost)); + } + + // update cost model + if (FLAGS_auto_schedule_use_cost_model) { + std::vector cost_model_samples(states.size()); + std::vector cost_model_labels(states.size()); + for (size_t i = 0; i < states.size(); ++i) { + cost_model_samples[i] = &(states[i]->ir_schedule.GetModule()); + cost_model_labels[i] = measure_outputs[i].execution_cost; + } + VLOG(4) << utils::StringFormat("Update CostModel with samples size=%lu,labels size=%lu", + cost_model_samples.size(), + cost_model_labels.size()); + cost_model_.Update(cost_model_samples, cost_model_labels, task_->target); + } + + // update the best + for (size_t i = 0; i < measure_outputs.size(); ++i) { + if (measure_outputs[i].execution_cost < best_cost) { + VLOG(4) << "Update best candidate with execution_cost:" << measure_outputs[i].execution_cost << "us"; + best_cost = measure_outputs[i].execution_cost; + optimized_funcs = measure_inputs[i].lowered_funcs; + } + } + + // count result size + measured_count += states.size(); + } + return result; +} + +std::vector TaskOptimizer::SearchOneRound(const TuningOptions& options, + std::vector* measure_candidates) { + std::vector states = evolutionary_search_->SearchModuleExprEpsGreedy(options); + VLOG(4) << JoinStatesDebugString("TaskOptimizer::EvolutionarySearch-Result", states, /*verbose=*/VLOG_IS_ON(5)); + + size_t valid_cnt = 0; + for (size_t i = 0; i < states.size(); ++i) { + std::vector best_exprs = states[i]->ir_schedule.GetModule().GetExprs(); + CHECK_EQ(best_exprs.size(), task_->lowered_funcs.size()) + << "RuntimeError: Expr size is not equal to LoweredFunc size in TaskOptimizer"; + auto init_funcs = optim::IRCopy(task_->lowered_funcs); + std::vector valid_funcs; + for (size_t j = 0; j < best_exprs.size(); ++j) { + auto updated_f = UpdateFuncWithNewBody(task_->target, init_funcs[j], best_exprs[j]); + if (PruneInvalid(updated_f, task_->target)) { + VLOG(4) << "PruneInvalid states-" << i; + break; + } + valid_funcs.emplace_back(updated_f); + } + + // all functions are validated, collect this state to be measured + if (valid_funcs.size() == init_funcs.size()) { + states[valid_cnt++] = states[i]; + measure_candidates->emplace_back(MeasureInput()); + measure_candidates->back().task = task_; + measure_candidates->back().lowered_funcs = std::move(valid_funcs); + } + } + + states.erase(states.begin() + valid_cnt, states.end()); + CHECK_EQ(states.size(), measure_candidates->size()) << "result size of states not equal to measure_candidates"; + VLOG(4) << "EvolutionarySearch return size=" << states.size() << ", valid count=" << valid_cnt; + VLOG(4) << JoinStatesDebugString("TaskOptimizer::SearchOneRound-Result", states, /*verbose=*/VLOG_IS_ON(5)); + return states; +} + +// detect the limit of available shared memory on the current NVGPU with CUDA runtime +size_t GetGPUSharedMemoryLimit() { +#ifdef CINN_WITH_CUDA + int device_id; + CUDA_CALL(cudaGetDevice(&device_id)); + cudaDeviceProp prop; + CUDA_CALL(cudaGetDeviceProperties(&prop, device_id)); + VLOG(4) << utils::StringFormat("GPU-%d GPUSharedMemoryLimit=%d", 
device_id, prop.sharedMemPerBlock); + return prop.sharedMemPerBlock; +#else + return 0; +#endif +} + +// detect the limit of available local/stack memory on the current NVGPU with CUDA runtime +size_t GetGPULocalStackLimit() { +#ifdef CINN_WITH_CUDA + int device_id; + CUDA_CALL(cudaGetDevice(&device_id)); + cudaDeviceProp prop; + CUDA_CALL(cudaGetDeviceProperties(&prop, device_id)); + size_t limit = prop.totalGlobalMem / prop.multiProcessorCount / prop.maxThreadsPerMultiProcessor; + VLOG(4) << utils::StringFormat( + "GPU-%d totalGlobalMem=%lu,maxThreadsPerMultiProcessor=%d,multiProcessorCount=%d, calculated " + "GPULocalStackLimit=%lu", + device_id, + prop.totalGlobalMem, + prop.multiProcessorCount, + prop.maxThreadsPerMultiProcessor, + limit); + return limit; +#else + return 0; +#endif +} + +// check whether usage of the specific memory type in the lowered_func exceeds hardware limit +bool IsGPUMemoryUsageExceedLimit(const ir::LoweredFunc& lowered_func, + const ir::MemoryType& used_memory_type, + const size_t limit_bytes) { + std::unordered_set visited; + size_t used_bytes_cnt = 0; + for (auto&& buf : lowered_func->temp_bufs) { + VLOG(5) << "temp buf name=" << buf->name << ", numel=" << buf->numel() << ",dtype=" << buf->dtype; + if (buf->memory_type == used_memory_type && !visited.count(buf->name)) { + used_bytes_cnt += buf->numel() * buf->dtype.bytes(); + visited.insert(buf->name); + } + } + VLOG(5) << "total used_bytes_cnt=" << used_bytes_cnt; + return used_bytes_cnt >= limit_bytes; +} + +bool PruneInvalid(const ir::LoweredFunc& lowered_func, const common::Target& target) { + static const size_t kGPUSharedMemoryLimitBytes = GetGPUSharedMemoryLimit(); + static const size_t kGPULocalStackLimitBytes = GetGPULocalStackLimit(); + + if (target == common::DefaultNVGPUTarget()) { + if (IsGPUMemoryUsageExceedLimit(lowered_func, ir::MemoryType::GPUShared, kGPUSharedMemoryLimitBytes)) { + VLOG(5) << ir::MemoryType::GPUShared << " memory usage exceeds limit, func:\n" << lowered_func; + return true; + } + + if (IsGPUMemoryUsageExceedLimit(lowered_func, ir::MemoryType::GPULocal, kGPULocalStackLimitBytes)) { + VLOG(5) << ir::MemoryType::GPULocal << " memory usage exceeds limit, func:\n" << lowered_func; + return true; + } + } + return false; +} + +} // namespace auto_schedule +} // namespace cinn diff --git a/paddle/cinn/auto_schedule/task/task_optimizer.h b/paddle/cinn/auto_schedule/task/task_optimizer.h new file mode 100644 index 0000000000000..68fb9f8457324 --- /dev/null +++ b/paddle/cinn/auto_schedule/task/task_optimizer.h @@ -0,0 +1,70 @@ +// Copyright (c) 2022 CINN Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
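As a usage sketch for the class declared below (illustrative; `task`, `measurer`, `database` and `options` are assumed to be set up by the surrounding auto-tuner):

    TaskOptimizer optimizer(&task, &measurer, &database, /*rand_seed=*/1);
    FunctionGroup tuned = optimizer.Optimize(options);  // best of evolution, manual and external candidates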
+ +#pragma once + +#include + +#include "cinn/auto_schedule/cost_model/expr_cost_model.h" +#include "cinn/auto_schedule/database/database.h" +#include "cinn/auto_schedule/measure/schedule_measurer.h" +#include "cinn/auto_schedule/search_strategy/evolutionary_search.h" +#include "cinn/auto_schedule/task/tune_task.h" +#include "cinn/auto_schedule/tuning.h" +#include "cinn/ir/lowered_func.h" +#include "cinn/utils/random_engine.h" + +namespace cinn { +namespace auto_schedule { + +// This class is responsible for tuning a specific task, +// it will integrate necessary components to search the +// optimal schedule for the task. +class TaskOptimizer { + public: + TaskOptimizer(TuneTask* task, + ScheduleMeasurer* schedule_measurer, + Database* database, + utils::LinearRandomEngine::StateType rand_seed = -1); + + FunctionGroup Optimize(const TuningOptions& options); + + private: + struct Result { + std::string from; + double cost; + FunctionGroup functions; + Result(const std::string& from_type) : from(from_type), cost(std::numeric_limits::max()) {} + }; + + Result OptimizeByManual(bool need_measure); + Result OptimizeByExternal(bool need_measure); + Result OptimizeByEvolution(const TuningOptions& options); + + // call search candidates once by EvolutionarySearch and prune invalid ones + std::vector SearchOneRound(const TuningOptions& options, std::vector* measure_candidates); + + private: + // the max retry times if continuously get empty result + static constexpr uint32_t kMaxRetryContinuousEmpty_ = 3; + TuneTask* task_; + ScheduleMeasurer* schedule_measurer_; + std::unique_ptr evolutionary_search_ = nullptr; + ExprCostModel cost_model_; + Database* database_; + utils::LinearRandomEngine::StateType rand_seed_; +}; + +} // namespace auto_schedule +} // namespace cinn diff --git a/paddle/cinn/auto_schedule/task/task_registry.h b/paddle/cinn/auto_schedule/task/task_registry.h new file mode 100644 index 0000000000000..ad069ecac8343 --- /dev/null +++ b/paddle/cinn/auto_schedule/task/task_registry.h @@ -0,0 +1,79 @@ +// Copyright (c) 2022 CINN Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include + +#include +#include + +#include "cinn/ir/ir_schedule.h" +#include "cinn/optim/ir_copy.h" +#include "cinn/utils/registry.h" + +namespace cinn { + +namespace auto_schedule { + +struct InitialTaskInfo { + std::string task_key; + ir::ModuleExpr module_expr; + + InitialTaskInfo(const std::string& task_key, const ir::ModuleExpr& module_expr) + : task_key(task_key), module_expr(module_expr) {} +}; + +// Global task registry, used to save the initial ModuleExpr of each task. +class InitialTaskRegistry : public Registry { + public: + static InitialTaskRegistry* Global() { + static InitialTaskRegistry x; + return &x; + } + + // Get the initial ModuleExpr of a task. 
+  inline const InitialTaskInfo* Get(const std::string& task_key) {
+    const InitialTaskInfo* task_info = Registry<InitialTaskInfo>::Find(task_key);
+    CHECK(task_info) << "InitialTaskInfo [" << task_key << "] is not registered";
+    return task_info;
+  }
+
+  // Check whether the task info with task_key exists.
+  inline bool Has(const std::string& task_key) { return nullptr != Registry<InitialTaskInfo>::Find(task_key); }
+
+  // Regist the initial ModuleExpr of a task into the map.
+  inline void Regist(const std::string& task_key, const ir::ModuleExpr& module_expr) {
+    std::lock_guard<std::mutex> guard(registering_mutex);
+    if (fmap_.count(task_key) == 0) {
+      InitialTaskInfo* task_info = new InitialTaskInfo(task_key, optim::IRCopy(module_expr));
+      __REGISTER__(task_key, task_info);
+    }
+  }
+
+ private:
+  InitialTaskRegistry() = default;
+  CINN_DISALLOW_COPY_AND_ASSIGN(InitialTaskRegistry);
+
+  // Regist the initial ModuleExpr of a task.
+  inline InitialTaskInfo* __REGISTER__(const std::string& task_key, InitialTaskInfo* task_info) {
+    fmap_[task_key] = task_info;
+    const_list_.push_back(task_info);
+    entry_list_.push_back(task_info);
+    return task_info;
+  }
+};
+
+}  // namespace auto_schedule
+}  // namespace cinn
diff --git a/paddle/cinn/auto_schedule/task/task_registry_test.cc b/paddle/cinn/auto_schedule/task/task_registry_test.cc
new file mode 100644
index 0000000000000..c94f0df743e9b
--- /dev/null
+++ b/paddle/cinn/auto_schedule/task/task_registry_test.cc
@@ -0,0 +1,105 @@
+// Copyright (c) 2022 CINN Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
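The registry contract exercised by this test reduces to a few calls (sketch; `key` and `expr` are assumed):

    InitialTaskRegistry* registry = InitialTaskRegistry::Global();
    registry->Regist(key, expr);   // stores an IRCopy; re-registering the same key is a no-op
    if (registry->Has(key)) {
      const ir::ModuleExpr& stored = registry->Get(key)->module_expr;  // Get() CHECK-fails on missing keys
    }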
+ +#include "cinn/auto_schedule/task/task_registry.h" + +#include +#include + +#include + +#include "cinn/auto_schedule/task/task_creator.h" +#include "cinn/auto_schedule/task/tune_task.h" +#include "cinn/frontend/net_builder.h" +#include "cinn/hlir/framework/graph.h" +#include "cinn/hlir/framework/graph_compiler.h" +#include "cinn/hlir/framework/op_lowering.h" +#include "cinn/utils/string.h" +#include "cinn/utils/type_defs.h" + +DECLARE_bool(auto_schedule_use_cost_model); +DECLARE_bool(cinn_ir_schedule); + +namespace cinn { +namespace auto_schedule { + +std::vector CreateTasks(hlir::framework::Graph* graph, const common::Target& target) { + // create tasks + TaskCreator task_creator; + std::vector tasks = task_creator.CreateTuneTaskOpLevel(graph); + + const auto& dtype_dict = graph->GetAttrs>("inferdtype"); + const auto& shape_dict = graph->GetAttrs>("infershape"); + + std::unique_ptr op_lowerer = + std::make_unique(dtype_dict, shape_dict, target); + for (TuneTask& task : tasks) { + task.Initialize(shape_dict, dtype_dict, op_lowerer.get()); + VLOG(3) << "Add a task with serialized_key:\n" << task.serialized_key; + } + + return tasks; +} + +std::shared_ptr CreateAddProgram(const common::Target& target) { + frontend::NetBuilder builder("test"); + + auto a = builder.CreateInput(Float(32), {1, 64, 112, 112}, "A"); + auto b = builder.CreateInput(Float(32), {64}, "B"); + auto c = builder.Add(a, b, 1); + + return std::make_shared(builder.Build(), target); +} + +TEST(TestTaskRegistry, basic) { + FLAGS_auto_schedule_use_cost_model = true; + FLAGS_cinn_ir_schedule = true; + +#ifdef CINN_WITH_CUDA + Target target = common::DefaultNVGPUTarget(); +#else + Target target = common::DefaultHostTarget(); +#endif + std::shared_ptr graph = CreateAddProgram(target); + std::vector tasks = CreateTasks(graph.get(), target); + + InitialTaskRegistry* task_registry = InitialTaskRegistry::Global(); + + std::vector module_exprs; + for (const TuneTask& task : tasks) { + module_exprs.emplace_back(task.GetLoweredFuncBodyExprs()); + task_registry->Regist(task.serialized_key, module_exprs.back()); + } + + for (int i = 0; i < tasks.size(); ++i) { + std::string key = tasks[i].serialized_key; + VLOG(3) << "serialized_key = " << key; + ir::ModuleExpr new_expr = task_registry->Get(key)->module_expr; + + ASSERT_EQ(new_expr.GetExprs().size(), module_exprs[i].GetExprs().size()); + for (int j = 0; j < new_expr.GetExprs().size(); ++j) { + VLOG(3) << "expr " << j << " of task " << key << " : " << new_expr.GetExprs().at(j); + ASSERT_EQ(utils::GetStreamCnt(new_expr.GetExprs().at(j)), utils::GetStreamCnt(module_exprs[i].GetExprs().at(j))); + } + } + + bool flag = task_registry->Has(tasks[0].serialized_key); + ASSERT_EQ(flag, true); + + flag = task_registry->Has("not_exist"); + ASSERT_EQ(flag, false); +} + +} // namespace auto_schedule +} // namespace cinn diff --git a/paddle/cinn/auto_schedule/task/tune_task.cc b/paddle/cinn/auto_schedule/task/tune_task.cc new file mode 100644 index 0000000000000..80998c3825a47 --- /dev/null +++ b/paddle/cinn/auto_schedule/task/tune_task.cc @@ -0,0 +1,97 @@ +// Copyright (c) 2022 CINN Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "cinn/auto_schedule/task/tune_task.h" + +#include + +#include +#include + +#include "cinn/auto_schedule/analysis/analyze_ir.h" +#include "cinn/hlir/framework/node.h" +#include "cinn/hlir/framework/op_lowering.h" +#include "cinn/ir/ir_base.h" +#include "cinn/ir/ir_schedule.h" +#include "cinn/ir/lowered_func.h" +#include "cinn/utils/string.h" + +namespace cinn { +namespace auto_schedule { + +void TuneTask::Initialize(const absl::flat_hash_map& shape_dict, + const absl::flat_hash_map& dtype_dict, + hlir::framework::OpLowerer* lower_handler) { + CHECK(lower_handler != nullptr) << "op_lowerer can't be nullptr"; + op_lowerer = lower_handler; + + // Set lowered_funcs and analyze output names. + this->lowered_funcs = op_lowerer->LowerWithoutSchedule(subgraph); + this->output_names = GetOutputNamesFromLoweredFunc(this->lowered_funcs); + this->serialized_key = SerializeToString(shape_dict, dtype_dict); +} + +std::vector TuneTask::GetLoweredFuncBodyExprs() const { + std::vector result; + for (const ir::LoweredFunc& func : lowered_funcs) { + result.push_back(func->body); + } + return result; +} + +std::string TuneTask::SerializeToString(const absl::flat_hash_map& shape_dict, + const absl::flat_hash_map& dtype_dict) { + std::stringstream ss; + ss << target << "\n\n"; // print target + + // local function to print dtype,shape of out/in variables of the specified node + auto print_node_links_fn = [&](const std::vector>& links, bool is_input) { + int printed_num = 0; + for (auto&& edge : links) { + const auto* var_node = is_input ? edge->source()->safe_as() + : edge->sink()->safe_as(); + CHECK(var_node) << "var node invalid"; + auto sit = shape_dict.find(var_node->id()); + CHECK(sit != shape_dict.end()) << "can't find shape of variable:" << var_node->id(); + auto dit = dtype_dict.find(var_node->id()); + CHECK(dit != dtype_dict.end()) << "can't find dtype of variable:" << var_node->id(); + if (printed_num > 0) { + ss << ", "; + } + ++printed_num; + // TODO(CtfGo): CINN uses the names of input/output NodeData ids as arguments of the LoweredFunc in the Lower + // process, so it will result in different LoweredFuncs for two Nodes even though they represents the same + // operator. Here we add `var_node->id()` into the serialized_key to distinguish them, otherwise AutoTuner will + // get wrong TuningRecords when querying cached results from database. In the future, we should remove + // name-related limit in Lower process, to avoid duplicate tuning tasks with same operators. 
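+      // (Editorial example) For a float32 input of shape [32, 24] named "A",
+      // the fragment emitted below would look roughly like: A->float32[32,24]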
+      ss << var_node->id() << "->" << cinn::common::Type2Str(dit->second) << "[" << utils::Join(sit->second, ",")
+         << "]";
+    }
+  };
+
+  // print each node of the subgraph
+  ss << "Group {\n";
+  for (auto&& node : subgraph->CollectNodes()) {
+    ss << "  (";
+    print_node_links_fn(node->outlinks_in_order(), false);
+    ss << ") = " << node->op()->name << "(";
+    print_node_links_fn(node->inlinks_in_order(), true);
+    ss << ")\n";
+  }
+  ss << "}\n";
+
+  return ss.str();
+}
+
+}  // namespace auto_schedule
+}  // namespace cinn
diff --git a/paddle/cinn/auto_schedule/task/tune_task.h b/paddle/cinn/auto_schedule/task/tune_task.h
new file mode 100644
index 0000000000000..4963a36fc4133
--- /dev/null
+++ b/paddle/cinn/auto_schedule/task/tune_task.h
@@ -0,0 +1,69 @@
+// Copyright (c) 2022 CINN Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <absl/container/flat_hash_map.h>
+
+#include <memory>
+#include <string>
+#include <unordered_set>
+#include <vector>
+
+#include "cinn/common/target.h"
+#include "cinn/common/type.h"
+#include "cinn/hlir/framework/graph.h"
+#include "cinn/hlir/framework/node.h"
+#include "cinn/hlir/framework/op_lowering.h"
+#include "cinn/ir/ir.h"
+#include "cinn/ir/ir_base.h"
+#include "cinn/ir/ir_schedule.h"
+#include "cinn/ir/lowered_func.h"
+
+namespace cinn {
+namespace auto_schedule {
+
+class TuneTask {
+ public:
+  TuneTask() = default;
+  explicit TuneTask(std::shared_ptr<hlir::framework::Graph::Group> group) : subgraph(group) {}
+  // Initialize a task
+  void Initialize(const absl::flat_hash_map<std::string, hlir::framework::shape_t>& shape_dict,
+                  const absl::flat_hash_map<std::string, cinn::common::Type>& dtype_dict,
+                  hlir::framework::OpLowerer* lower_handler);
+  // Extract the bodies of lowered_funcs and return them
+  std::vector<ir::Expr> GetLoweredFuncBodyExprs() const;
+
+  // In CINN, we use hlir::framework::Graph::Group to represent a fused
+  // sub-graph (an op that won't be fused becomes a Group with size 1).
+  std::shared_ptr<hlir::framework::Graph::Group> subgraph;
+  // Lower handler, not owned
+  hlir::framework::OpLowerer* op_lowerer{nullptr};
+  // target of this task
+  common::Target target;
+  // stores the initial (un-optimized) LoweredFuncs
+  std::vector<ir::LoweredFunc> lowered_funcs;
+  // names of the output arguments of lowered_funcs
+  std::unordered_set<std::string> output_names;
+  // serialized string of this task; it records the structure, shapes, dtypes and
+  // input/output variable names of the subgraph, and is used as a cache key
+  std::string serialized_key;
+
+ private:
+  // Serialize this task as a string containing its distinguishing fields
+  std::string SerializeToString(const absl::flat_hash_map<std::string, hlir::framework::shape_t>& shape_dict,
+                                const absl::flat_hash_map<std::string, cinn::common::Type>& dtype_dict);
+};
+
+}  // namespace auto_schedule
+}  // namespace cinn
diff --git a/paddle/cinn/auto_schedule/task/tune_task_test.cc b/paddle/cinn/auto_schedule/task/tune_task_test.cc
new file mode 100755
index 0000000000000..9ff7ea26392cd
--- /dev/null
+++ b/paddle/cinn/auto_schedule/task/tune_task_test.cc
@@ -0,0 +1,339 @@
+// Copyright (c) 2022 CINN Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "cinn/auto_schedule/task/tune_task.h" + +#include + +#include +#include +#include + +#include "cinn/auto_schedule/task/task_creator.h" +#include "cinn/common/context.h" +#include "cinn/common/target.h" +#include "cinn/frontend/net_builder.h" +#include "cinn/frontend/syntax.h" +#include "cinn/hlir/framework/graph.h" +#include "cinn/hlir/framework/node.h" +#include "cinn/hlir/framework/op_lowering.h" +#include "cinn/hlir/framework/pass.h" +#include "cinn/hlir/framework/scope.h" +#include "cinn/ir/ir_base.h" +#include "cinn/ir/ir_printer.h" +#include "cinn/ir/ir_schedule.h" +#include "cinn/utils/string.h" + +DECLARE_bool(cinn_ir_schedule); + +namespace cinn { +namespace auto_schedule { + +using ::cinn::frontend::NetBuilder; +using ::cinn::frontend::Program; +using ::cinn::hlir::framework::OpLowerer; + +Program CreateAddProgram() { + constexpr int M = 32; + constexpr int N = 24; + + NetBuilder builder("net_builder"); + auto a = builder.CreateInput(Float(32), {M, N}, "A"); + auto b = builder.CreateInput(Float(32), {M, N}, "B"); + auto c = builder.Add(a, b); + auto d = builder.Add(a, c); + auto program = builder.Build(); + + return program; +} + +TEST(TuneTask, GraphToUnoptLoweredFunc_NoPass) { + // Auto tuner is combined with IR schedule + FLAGS_cinn_ir_schedule = true; + Context::Global().ResetNameId(); +#ifdef CINN_WITH_CUDA + Target target = common::DefaultNVGPUTarget(); +#else + Target target = common::DefaultHostTarget(); +#endif + Program prog = CreateAddProgram(); + auto graph = std::make_shared(prog, target); + + TaskCreator task_creator; + std::vector tasks = task_creator.CreateTuneTaskOpLevel(graph.get()); + ASSERT_EQ(tasks.size(), 2UL); + + const auto& shape_dict = graph->GetAttrs>("infershape"); + const auto& dtype_dict = graph->GetAttrs>("inferdtype"); + OpLowerer op_lowerer(dtype_dict, shape_dict, target); + + std::stringstream ss; + for (TuneTask& task : tasks) { + task.Initialize(shape_dict, dtype_dict, &op_lowerer); + + std::vector exprs = task.GetLoweredFuncBodyExprs(); + VLOG(6) << "ir:Expr is: "; + for (const ir::Expr& e : exprs) { + VLOG(6) << e; + ss << e << std::endl; + } + } + + std::string expr_str = ss.str(); +#ifdef CINN_WITH_CUDA + std::string target_str = R"ROC( +{ + ScheduleBlock(root) + { + serial for (i, 0, 32) + { + serial for (j, 0, 24) + { + ScheduleBlock(var_1) + { + i0, i1 = axis.bind(i, j) + var_1[i, j] = (A[i, j] + B[i, j]) + } + } + } + } +} +{ + ScheduleBlock(root_0) + { + serial for (i, 0, 32) + { + serial for (j, 0, 24) + { + ScheduleBlock(var_2) + { + i0_0, i1_0 = axis.bind(i, j) + var_2[i, j] = (A[i, j] + var_1[i, j]) + } + } + } + } +} +)ROC"; +#else + std::string target_str = R"ROC( +{ + ScheduleBlock(root) + { + serial for (i, 0, 32) + { + serial for (j, 0, 24) + { + ScheduleBlock(var_1) + { + i0, i1 = axis.bind(i, j) + var_1[i0, i1] = (A[i0, i1] + B[i0, i1]) + } + } + } + } +} +{ + ScheduleBlock(root_0) + { + serial for (i, 0, 32) + { + serial for (j, 0, 24) + { + ScheduleBlock(var_2) + { + i0_0, i1_0 = axis.bind(i, j) + var_2[i0_0, i1_0] = (A[i0_0, i1_0] + var_1[i0_0, i1_0]) + } + } + } + } +} +)ROC"; +#endif + + 
EXPECT_EQ(utils::Trim(target_str), utils::Trim(expr_str)); +} + +TEST(TuneTask, GraphToUnoptLoweredFunc_ApplyPass) { + // Auto tuner is combined with IR schedule + FLAGS_cinn_ir_schedule = true; + Context::Global().ResetNameId(); +#ifdef CINN_WITH_CUDA + Target target = common::DefaultNVGPUTarget(); +#else + Target target = common::DefaultHostTarget(); +#endif + Program prog = CreateAddProgram(); + auto graph = std::make_shared(prog, target); + ApplyPass(graph.get(), "OpFusionPass"); + + TaskCreator task_creator; + std::vector tasks = task_creator.CreateTuneTaskOpLevel(graph.get()); + + ASSERT_EQ(tasks.size(), 1UL); + + const auto& shape_dict = graph->GetAttrs>("infershape"); + const auto& dtype_dict = graph->GetAttrs>("inferdtype"); + + OpLowerer op_lowerer(dtype_dict, shape_dict, target); + + std::stringstream ss; + for (TuneTask& task : tasks) { + task.Initialize(shape_dict, dtype_dict, &op_lowerer); + + std::vector exprs = task.GetLoweredFuncBodyExprs(); + VLOG(6) << "ir:Expr is: "; + for (const ir::Expr& e : exprs) { + VLOG(6) << e; + ss << e << std::endl; + } + } + + std::string expr_str = ss.str(); +#ifdef CINN_WITH_CUDA + std::string target_str = R"ROC( +{ + ScheduleBlock(root) + { + { + serial for (i, 0, 32) + { + serial for (j, 0, 24) + { + ScheduleBlock(var_1) + { + i0, i1 = axis.bind(i, j) + var_1[i, j] = (A[i, j] + B[i, j]) + } + } + } + serial for (i, 0, 32) + { + serial for (j, 0, 24) + { + ScheduleBlock(var_2) + { + i0_0, i1_0 = axis.bind(i, j) + var_2[i, j] = (A[i, j] + var_1[i, j]) + } + } + } + } + } +} +)ROC"; + +#else + std::string target_str = R"ROC( +{ + ScheduleBlock(root) + { + { + serial for (i, 0, 32) + { + serial for (j, 0, 24) + { + ScheduleBlock(var_1) + { + i0, i1 = axis.bind(i, j) + var_1[i0, i1] = (A[i0, i1] + B[i0, i1]) + } + } + } + serial for (i, 0, 32) + { + serial for (j, 0, 24) + { + ScheduleBlock(var_2) + { + i0_0, i1_0 = axis.bind(i, j) + var_2[i0_0, i1_0] = (A[i0_0, i1_0] + var_1[i0_0, i1_0]) + } + } + } + } + } +} +)ROC"; +#endif + + EXPECT_EQ(utils::Trim(target_str), utils::Trim(expr_str)); +} + +TEST(TuneTask, SerializeToString) { + Context::Global().ResetNameId(); +#ifdef CINN_WITH_CUDA + Target target = common::DefaultNVGPUTarget(); +#else + Target target = common::DefaultHostTarget(); +#endif + Program prog = CreateAddProgram(); + auto graph = std::make_shared(prog, target); + + TaskCreator task_creator; + std::vector single_tasks = task_creator.CreateTuneTaskOpLevel(graph.get()); + + const auto& shape_dict = graph->GetAttrs>("infershape"); + const auto& dtype_dict = graph->GetAttrs>("inferdtype"); + OpLowerer op_lowerer(dtype_dict, shape_dict, target); + ASSERT_EQ(single_tasks.size(), 2UL); + for (auto&& task : single_tasks) { + task.Initialize(shape_dict, dtype_dict, &op_lowerer); + } + +#ifdef CINN_WITH_CUDA + std::string single_add_str = R"ROC(Target + +Group { + (var_1->float32[32,24]) = elementwise_add(A->float32[32,24], B->float32[32,24]) +} +)ROC"; +#else + std::string single_add_str = R"ROC(Target + +Group { + (var_1->float32[32,24]) = elementwise_add(A->float32[32,24], B->float32[32,24]) +} +)ROC"; +#endif + EXPECT_EQ(single_tasks[0].serialized_key, single_add_str); + + ApplyPass(graph.get(), "OpFusionPass"); + std::vector fused_tasks = task_creator.CreateTuneTaskOpLevel(graph.get()); + ASSERT_EQ(fused_tasks.size(), 1UL); + fused_tasks[0].Initialize(shape_dict, dtype_dict, &op_lowerer); + +#ifdef CINN_WITH_CUDA + std::string fused_expected_str = R"ROC(Target + +Group { + (var_1->float32[32,24]) = elementwise_add(A->float32[32,24], 
B->float32[32,24])
+  (var_2->float32[32,24]) = elementwise_add(A->float32[32,24], var_1->float32[32,24])
+}
+)ROC";
+#else
+  std::string fused_expected_str = R"ROC(Target

+Group {
+  (var_1->float32[32,24]) = elementwise_add(A->float32[32,24], B->float32[32,24])
+  (var_2->float32[32,24]) = elementwise_add(A->float32[32,24], var_1->float32[32,24])
+}
+)ROC";
+#endif
+  EXPECT_EQ(fused_tasks[0].serialized_key, fused_expected_str);
+}
+
+}  // namespace auto_schedule
+}  // namespace cinn
diff --git a/paddle/cinn/auto_schedule/task_scheduler/CMakeLists.txt b/paddle/cinn/auto_schedule/task_scheduler/CMakeLists.txt
new file mode 100644
index 0000000000000..d938b027a7c5f
--- /dev/null
+++ b/paddle/cinn/auto_schedule/task_scheduler/CMakeLists.txt
@@ -0,0 +1,5 @@
+core_gather_headers()
+
+gather_srcs(cinnapi_src SRCS task_scheduler.cc round_robin.cc efficiency_priority.cc)
+
+cc_test(test_task_scheduler SRCS task_scheduler_test.cc DEPS cinncore)
diff --git a/paddle/cinn/auto_schedule/task_scheduler/efficiency_priority.cc b/paddle/cinn/auto_schedule/task_scheduler/efficiency_priority.cc
new file mode 100644
index 0000000000000..a83f8004965c2
--- /dev/null
+++ b/paddle/cinn/auto_schedule/task_scheduler/efficiency_priority.cc
@@ -0,0 +1,33 @@
+// Copyright (c) 2022 CINN Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "cinn/auto_schedule/task_scheduler/efficiency_priority.h"
+
+namespace cinn {
+namespace auto_schedule {
+
+int EfficiencyPriority::NextTaskId() {
+  while (cur_task_id_ < tasks_->size()) {
+    if (IsTaskToTune(&tasks_->at(cur_task_id_))) {
+      return cur_task_id_++;
+    }
+    ++cur_task_id_;
+  }
+  return -1;
+}
+
+// placeholder implementation: ignores `task` and only checks the configured threshold
+bool EfficiencyPriority::IsTaskToTune(const TuneTask* task) { return config_.minimum_gain_threshold > 0.0; }
+
+}  // namespace auto_schedule
+}  // namespace cinn
diff --git a/paddle/cinn/auto_schedule/task_scheduler/efficiency_priority.h b/paddle/cinn/auto_schedule/task_scheduler/efficiency_priority.h
new file mode 100644
index 0000000000000..af6e5272b09fe
--- /dev/null
+++ b/paddle/cinn/auto_schedule/task_scheduler/efficiency_priority.h
@@ -0,0 +1,39 @@
+// Copyright (c) 2022 CINN Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <vector>
+
+#include "cinn/auto_schedule/task_scheduler/task_scheduler.h"
+
+namespace cinn {
+namespace auto_schedule {
+
+// Schedules tasks with the efficiency_priority strategy, that is,
+// picking the task with the maximum expected gain ratio.
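+//
+// Note: the current implementation is only a stub. IsTaskToTune() checks the
+// configured Config::minimum_gain_threshold rather than any per-task estimate,
+// so a non-positive threshold makes NextTaskId() return -1 for every task
+// (this is what task_scheduler_test.cc relies on).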
+class EfficiencyPriority : public TaskScheduler {
+ public:
+  EfficiencyPriority(const std::vector<TuneTask>& tasks, const Config& config) : TaskScheduler(tasks, config) {}
+
+  const char* Name() const override { return "efficiency_priority"; }
+
+  int NextTaskId() override;
+
+ private:
+  bool IsTaskToTune(const TuneTask* task);
+};
+
+}  // namespace auto_schedule
+}  // namespace cinn
diff --git a/paddle/cinn/auto_schedule/task_scheduler/round_robin.cc b/paddle/cinn/auto_schedule/task_scheduler/round_robin.cc
new file mode 100644
index 0000000000000..37af0cee556c0
--- /dev/null
+++ b/paddle/cinn/auto_schedule/task_scheduler/round_robin.cc
@@ -0,0 +1,28 @@
+// Copyright (c) 2022 CINN Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "cinn/auto_schedule/task_scheduler/round_robin.h"
+
+namespace cinn {
+namespace auto_schedule {
+
+int RoundRobin::NextTaskId() {
+  if (cur_task_id_ < tasks_->size()) {
+    return cur_task_id_++;
+  }
+  return -1;
+}
+
+}  // namespace auto_schedule
+}  // namespace cinn
diff --git a/paddle/cinn/auto_schedule/task_scheduler/round_robin.h b/paddle/cinn/auto_schedule/task_scheduler/round_robin.h
new file mode 100644
index 0000000000000..55429fce92f1f
--- /dev/null
+++ b/paddle/cinn/auto_schedule/task_scheduler/round_robin.h
@@ -0,0 +1,36 @@
+// Copyright (c) 2022 CINN Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <vector>
+
+#include "cinn/auto_schedule/task_scheduler/task_scheduler.h"
+
+namespace cinn {
+namespace auto_schedule {
+
+// Schedules tasks with the round_robin strategy, that is,
+// picking tasks to tune one at a time, in order.
+class RoundRobin : public TaskScheduler {
+ public:
+  RoundRobin(const std::vector<TuneTask>& tasks, const Config& config) : TaskScheduler(tasks, config) {}
+
+  const char* Name() const override { return "round_robin"; }
+
+  int NextTaskId() override;
+};
+
+}  // namespace auto_schedule
+}  // namespace cinn
diff --git a/paddle/cinn/auto_schedule/task_scheduler/task_scheduler.cc b/paddle/cinn/auto_schedule/task_scheduler/task_scheduler.cc
new file mode 100644
index 0000000000000..0c6f99ad73c6e
--- /dev/null
+++ b/paddle/cinn/auto_schedule/task_scheduler/task_scheduler.cc
@@ -0,0 +1,46 @@
+// Copyright (c) 2022 CINN Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "cinn/auto_schedule/task_scheduler/task_scheduler.h"
+
+#include <glog/logging.h>
+
+#include "cinn/auto_schedule/task/tune_task.h"
+#include "cinn/auto_schedule/task_scheduler/efficiency_priority.h"
+#include "cinn/auto_schedule/task_scheduler/round_robin.h"
+
+namespace cinn {
+namespace auto_schedule {
+
+std::unique_ptr<TaskScheduler> TaskScheduler::Make(const std::vector<TuneTask>& tasks,
+                                                   const Config& config,
+                                                   const std::string& strategy) {
+  CHECK_GT(tasks.size(), 0) << "Empty task list";
+  if (strategy == "round_robin") {
+    return std::make_unique<RoundRobin>(tasks, config);
+  } else if (strategy == "efficiency_priority") {
+    return std::make_unique<EfficiencyPriority>(tasks, config);
+  }
+
+  LOG(FATAL) << "Unimplemented strategy:" << strategy;
+  return nullptr;
+}
+
+TaskScheduler::TaskScheduler(const std::vector<TuneTask>& tasks, const Config& config)
+    : tasks_(&tasks), config_(config), cur_task_id_(0) {}
+
+void TaskScheduler::Reset() { cur_task_id_ = 0; }
+
+}  // namespace auto_schedule
+}  // namespace cinn
diff --git a/paddle/cinn/auto_schedule/task_scheduler/task_scheduler.h b/paddle/cinn/auto_schedule/task_scheduler/task_scheduler.h
new file mode 100644
index 0000000000000..cd8776bd97620
--- /dev/null
+++ b/paddle/cinn/auto_schedule/task_scheduler/task_scheduler.h
@@ -0,0 +1,67 @@
+// Copyright (c) 2022 CINN Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <functional>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "cinn/auto_schedule/task/task_optimizer.h"
+#include "cinn/auto_schedule/task/tune_task.h"
+#include "cinn/auto_schedule/tuning.h"
+
+namespace cinn {
+namespace auto_schedule {
+
+// Class for scheduling tasks to perform auto-tuning
+class TaskScheduler {
+ public:
+  // All configs for the different schedule strategies
+  // are defined here together.
+  struct Config {
+    // The minimum threshold of the gain ratio, used by EfficiencyPriority
+    float minimum_gain_threshold = 0.0;
+  };
+
+  // Create a TaskScheduler with the given strategy name
+  // and the necessary construction parameters.
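+  //
+  // A minimal usage sketch (the strategy defaults to "round_robin"):
+  //   TaskScheduler::Config config;
+  //   auto scheduler = TaskScheduler::Make(tasks, config);
+  //   for (int id = scheduler->NextTaskId(); id != -1; id = scheduler->NextTaskId()) {
+  //     // tune tasks[id] ...
+  //   }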
+  static std::unique_ptr<TaskScheduler> Make(const std::vector<TuneTask>& tasks,
+                                             const Config& config,
+                                             const std::string& strategy = "round_robin");
+
+  // Reset associated states to schedule from the beginning
+  void Reset();
+
+  // Return the name of the schedule strategy
+  virtual const char* Name() const = 0;
+
+  // Select the next task to tune; returns -1 when no task is left
+  virtual int NextTaskId() = 0;
+
+ protected:
+  // A TaskScheduler object should be created with the static function Make
+  TaskScheduler(const std::vector<TuneTask>& tasks, const Config& config);
+
+  // The config for the scheduling strategy
+  Config config_;
+  // The current task id to be estimated
+  int cur_task_id_;
+  // Points to all tasks, not owned
+  const std::vector<TuneTask>* tasks_;
+};
+
+}  // namespace auto_schedule
+}  // namespace cinn
diff --git a/paddle/cinn/auto_schedule/task_scheduler/task_scheduler_test.cc b/paddle/cinn/auto_schedule/task_scheduler/task_scheduler_test.cc
new file mode 100644
index 0000000000000..a05b8dab3fd28
--- /dev/null
+++ b/paddle/cinn/auto_schedule/task_scheduler/task_scheduler_test.cc
@@ -0,0 +1,56 @@
+// Copyright (c) 2022 CINN Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "cinn/auto_schedule/task_scheduler/task_scheduler.h"
+
+#include <gtest/gtest.h>
+
+#include <vector>
+
+#include "cinn/auto_schedule/task_scheduler/efficiency_priority.h"
+#include "cinn/auto_schedule/task_scheduler/round_robin.h"
+
+namespace cinn {
+namespace auto_schedule {
+
+TEST(TaskScheduler, Make) {
+  std::vector<TuneTask> tasks(3);
+  TaskScheduler::Config config;
+
+  auto round_robin = TaskScheduler::Make(tasks, config);
+  ASSERT_STREQ(round_robin->Name(), "round_robin");
+  auto efficiency_priority = TaskScheduler::Make(tasks, config, "efficiency_priority");
+  ASSERT_STREQ(efficiency_priority->Name(), "efficiency_priority");
+}
+
+TEST(RoundRobinScheduler, NextTaskId) {
+  std::vector<TuneTask> tasks(3);
+  TaskScheduler::Config config;
+  auto round_robin = TaskScheduler::Make(tasks, config);
+  ASSERT_EQ(0, round_robin->NextTaskId());
+  ASSERT_EQ(1, round_robin->NextTaskId());
+  round_robin->Reset();
+  ASSERT_EQ(0, round_robin->NextTaskId());
+}
+
+TEST(EfficiencyPriorityScheduler, NextTaskId) {
+  std::vector<TuneTask> tasks(3);
+  TaskScheduler::Config config;
+  config.minimum_gain_threshold = -1.0;
+  auto efficiency_priority = TaskScheduler::Make(tasks, config, "efficiency_priority");
+  ASSERT_EQ(-1, efficiency_priority->NextTaskId());
+}
+
+}  // namespace auto_schedule
+}  // namespace cinn
diff --git a/paddle/cinn/auto_schedule/tests/CMakeLists.txt b/paddle/cinn/auto_schedule/tests/CMakeLists.txt
new file mode 100644
index 0000000000000..407400b1f241b
--- /dev/null
+++ b/paddle/cinn/auto_schedule/tests/CMakeLists.txt
@@ -0,0 +1,5 @@
+if (WITH_CUDA AND (NOT WITH_CUDNN))
+  cc_test(test_performance_comparison
+          ARGS "--resnet50_model_dir=${THIRD_PARTY_PATH}/ResNet50"
+          SRCS performance_comparison_test.cc DEPS cinncore test_program_builder)
+endif()
diff --git a/paddle/cinn/auto_schedule/tests/performance_comparison_test.cc 
b/paddle/cinn/auto_schedule/tests/performance_comparison_test.cc
new file mode 100644
index 0000000000000..35a1e58063605
--- /dev/null
+++ b/paddle/cinn/auto_schedule/tests/performance_comparison_test.cc
@@ -0,0 +1,310 @@
+// Copyright (c) 2022 CINN Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <gflags/gflags.h>
+#include <gtest/gtest.h>
+
+#include <bitset>
+#include <memory>
+
+#include "cinn/auto_schedule/auto_tuner.h"
+#include "cinn/common/target.h"
+#include "cinn/frontend/net_builder.h"
+#include "cinn/frontend/optimize.h"
+#include "cinn/frontend/paddle_model_convertor.h"
+#include "cinn/frontend/syntax.h"
+#include "cinn/hlir/framework/graph_compiler.h"
+#include "cinn/hlir/framework/node.h"
+#include "cinn/hlir/framework/pass.h"
+#include "cinn/ir/ir_base.h"
+#include "cinn/runtime/flags.h"
+#include "cinn/utils/data_util.h"
+#include "tests/program_builder.h"
+
+/* This test is a tool to evaluate and compare the performance of three schedule modes (no schedule, manual schedule
+ * and auto-schedule). You can specify which modes to evaluate through `FLAGS_evaluate_knobs` and which operator or
+ * model to test through `--gtest_filter=PerformanceTester.xx`; for example, `FLAGS_evaluate_knobs=4
+ * --gtest_filter=PerformanceTester.Matmul` evaluates auto-schedule on the Matmul operator. Refer to the explanation
+ * of the following flags and parameters for more detail.
+ */
+
+DEFINE_string(resnet50_model_dir, "./ResNet50", "the path to the paddle resnet50 model.");
+// Flags that control which schedule tests will be run.
+// Bit with index 0 controls the no-schedule test, so options = 1 = "001" runs the no-schedule test.
+// Bit with index 1 controls the manual-schedule test, so options = 2 = "010" runs the manual-schedule test.
+// Bit with index 2 controls the auto-schedule test, so options = 4 = "100" runs the auto-schedule test.
+// The default value is -1, which means the flag is disabled and does not override the options.
+DEFINE_int32(evaluate_knobs, -1, "the options to control which schedule tests will be run.");
+DECLARE_int32(cinn_parallel_compile_size);
+
+namespace cinn {
+namespace auto_schedule {
+
+using ::cinn::hlir::framework::BuildScope;
+using ::cinn::hlir::framework::Graph;
+using ::cinn::hlir::framework::GraphCompiler;
+using ::cinn::hlir::framework::Instruction;
+using ::cinn::hlir::framework::Scope;
+
+class PerformanceTester : public ::testing::Test {
+ public:
+  struct Options {
+    // number of times the compiled runtime program is executed repeatedly
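+    // (e.g. repeat_times = 2 makes ExecuteTest below run every built program twice)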
+ int repeat_times = 2; + // the num_tuning_rounds for auto tuning + int num_tuning_rounds = 2; + // knobs to control which schedules will be measured, refer to FLAGS_evaluate_knobs explanation + std::bitset<3> evaluate_knobs = 0UL; + }; + + void SetUp() override { FLAGS_cinn_parallel_compile_size = 0; } + + void Evaluate(const frontend::Program& program) { + if (FLAGS_evaluate_knobs >= 0) { + options_.evaluate_knobs = FLAGS_evaluate_knobs; + } + VLOG(3) << "evaluate_knobs = " << options_.evaluate_knobs; + + auto worker_fn = [this, &program]( + const std::string& schedule_name, BuildRuntimeProgramFn build_fn, bool execute = true) { + Context::Global().ResetNameId(); + VLOG(3) << "Initialize graph."; + auto graph = std::make_shared(program, target_); + VLOG(3) << "Apply graph pass."; + hlir::framework::ApplyPass(graph.get(), "OpFusionPass"); + VLOG(3) << "Build " << schedule_name << " program."; + auto scope = BuildScope(target_, graph); + auto graph_compiler = std::make_unique(target_, scope, graph); + auto runtime_program = (this->*build_fn)(graph.get(), graph_compiler.get()); + if (execute) { + VLOG(3) << "Execute " << schedule_name << " program."; + runtime_program->ExecuteTest(options_.repeat_times); + } + }; + + // if no one is set, build no/manual schedule cases to ensure their build functions are valid + if (options_.evaluate_knobs.none()) { + worker_fn("no schedule", &PerformanceTester::BuildNoScheduleProgram, /* execute */ false); + worker_fn("manual schedule", &PerformanceTester::BuildManualScheduleProgram, /* execute */ false); + } else { + if (options_.evaluate_knobs.test(0)) { + worker_fn("no schedule", &PerformanceTester::BuildNoScheduleProgram); + } + if (options_.evaluate_knobs.test(1)) { + worker_fn("manual schedule", &PerformanceTester::BuildManualScheduleProgram); + } + if (options_.evaluate_knobs.test(2)) { + worker_fn("auto schedule", &PerformanceTester::BuildAutoScheduleProgram); + } + } + } + + protected: + using BuildRuntimeProgramFn = std::unique_ptr (PerformanceTester::*)(Graph*, + GraphCompiler*); + + std::unique_ptr BuildNoScheduleProgram(Graph* graph, GraphCompiler* graph_compiler) { + const auto& dtype_dict = graph->GetAttrs>("inferdtype"); + const auto& shape_dict = graph->GetAttrs>("infershape"); + + std::shared_ptr op_lowerer = + std::make_unique(dtype_dict, shape_dict, target_); + + GraphCompiler::CompileOptions compile_options; + compile_options.with_instantiate_variables = true; + + if (graph->fusion_groups.empty()) { + hlir::framework::ApplyPasses(graph, {"BuildNonFusedGroupsPass"}); + } + compile_options.groups = graph->fusion_groups; + + for (auto group : graph->fusion_groups) { + compile_options.lowered_funcs.push_back(op_lowerer->LowerWithoutSchedule(group)); + } + + VLOG(3) << "===========================No Schedule LoweredFunc Begin==========================="; + for (const auto& funcvec : compile_options.lowered_funcs) { + for (const auto& func : funcvec) { + VLOG(3) << func; + } + } + VLOG(3) << "===========================No Schedule LoweredFunc End============================="; + + return graph_compiler->Build(compile_options).runtime_program; + } + + std::unique_ptr BuildManualScheduleProgram(Graph* graph, GraphCompiler* graph_compiler) { + return graph_compiler->Build(); + } + + std::unique_ptr BuildAutoScheduleProgram(Graph* graph, GraphCompiler* graph_compiler) { + auto tuner = std::make_unique(target_, graph); + + AutoTuner::Config tuning_config; + TuningOptions tuning_options; + tuning_options.num_tuning_rounds = 
options_.num_tuning_rounds; + tuning_options.num_measure_trials = 2; + tuning_options.num_samples_per_iteration = 2; + + tuner->Initialize(tuning_config, graph_compiler); + TuningResult tuning_result = tuner->Tune(tuning_options); + + GraphCompiler::CompileOptions compile_options; + compile_options.with_instantiate_variables = true; + compile_options.Apply(tuning_result); + + VLOG(3) << "===========================Auto Schedule LoweredFunc Begin==========================="; + for (const auto& funcvec : compile_options.lowered_funcs) { + for (const auto& func : funcvec) { + VLOG(3) << func; + } + } + VLOG(3) << "===========================Auto Schedule LoweredFunc End============================="; + + return graph_compiler->Build(compile_options).runtime_program; + } + +#ifdef CINN_WITH_CUDA + Target target_ = common::DefaultNVGPUTarget(); +#else + Target target_ = common::DefaultHostTarget(); +#endif + Options options_; +}; + +constexpr int batch_size = 2; + +TEST_F(PerformanceTester, Mul) { Evaluate(tests::OpBuilder("mul").Build({{"X", {32, 16}}, {"Y", {16, 32}}})); } + +TEST_F(PerformanceTester, Add) { + Evaluate(tests::OpBuilder("elementwise_add").Build({{"X", {1, 56, 56, 256}}, {"Y", {1, 56, 56, 256}}})); +} + +TEST_F(PerformanceTester, Matmul) { + Evaluate(tests::OpBuilder("matmul").Build({{"X", {batch_size, 2048}}, {"Y", {2048, 1000}}})); +} + +TEST_F(PerformanceTester, Relu) { Evaluate(tests::OpBuilder("relu").Build({{"X", {batch_size, 64, 56, 56}}})); } + +TEST_F(PerformanceTester, Conv2d) { + std::vector strides{2, 2}; + std::vector paddings{3, 3}; + std::vector dilations{1, 1}; + int groups = 1; + std::string conv_type = "forward"; + std::string data_format = "NCHW"; + std::string padding_algorithm = "EXPLICIT"; + + Evaluate(tests::OpBuilder("conv2d").Build({{"X", {batch_size, 3, 224, 224}}, {"W", {64, 3, 7, 7}}}, + {{"stride", strides}, + {"padding", paddings}, + {"dilation", dilations}, + {"groups", groups}, + {"conv_type", conv_type}, + {"data_format", data_format}, + {"padding_algorithm", padding_algorithm}})); +} + +TEST_F(PerformanceTester, Pool2d) { + std::vector input_shape{batch_size, 64, 112, 112}; + std::string pooling_type = "max"; + std::vector ksize{3, 3}; + std::vector strides{2, 2}; + std::vector paddings{1, 1, 1, 1}; + bool ceil_mode = false; + bool exclusive = true; + bool global_pooling = false; + std::string data_format = "NCHW"; + bool adaptive = false; + std::string padding_algorithm = "EXPLICIT"; + + Evaluate(tests::OpBuilder("pool2d").Build({{"X", {batch_size, 64, 112, 112}}}, + {{"pool_type", pooling_type}, + {"kernel_size", ksize}, + {"stride_size", strides}, + {"padding_size", paddings}, + {"ceil_mode", ceil_mode}, + {"exclusive", exclusive}, + {"global_pooling", global_pooling}, + {"data_format", data_format}, + {"adaptive", adaptive}, + {"padding_algorithm", padding_algorithm}})); +} + +TEST_F(PerformanceTester, BatchNorm) { + std::vector input_shape{batch_size, 64, 112, 112}; + std::vector scale_shape{64}; + std::vector bias_shape{64}; + std::vector mean_shape{64}; + std::vector variance_shape{64}; + float epsilon = 1e-5f; + float momentum = 0.9f; + const std::string& data_layout = "NCHW"; + + Evaluate( + tests::OpBuilder("batch_norm") + .Build( + {{"X", {batch_size, 64, 112, 112}}, {"scale", {64}}, {"bias", {64}}, {"mean", {64}}, {"variance", {64}}}, + {{"epsilon", epsilon}, {"momentum", momentum}, {"data_layout", data_layout}})); +} + +TEST_F(PerformanceTester, Reshape) { + std::vector output_shape{batch_size, 2048}; + + 
Evaluate(tests::OpBuilder("reshape").Build({{"X", {batch_size, 2048, 1, 1}}}, {{"shape", output_shape}}));
+}
+
+TEST_F(PerformanceTester, Softmax) {
+  std::vector<int> axes = {-1};
+  std::string mode = "fast";
+  std::string data_format = "AnyLayout";
+
+  Evaluate(tests::OpBuilder("softmax").Build({{"X", {batch_size, 1000}}},
+                                             {{"axes", axes}, {"mode", mode}, {"data_format", data_format}}));
+}
+
+TEST_F(PerformanceTester, Scale) {
+  float scale = 1.0f;
+  float bias = 0.0f;
+  bool bias_after_scale = true;
+
+  Evaluate(tests::OpBuilder("scale").Build(
+      {{"X", {batch_size, 1000}}}, {{"scale", scale}, {"bias", bias}, {"bias_after_scale", bias_after_scale}}));
+}
+
+TEST_F(PerformanceTester, LookupTable) {
+  int64_t padding_idx = -1;
+
+  Evaluate(
+      tests::OpBuilder("lookup_table")
+          .Build({{"table", {50001, 768}}, {"ids", {10, 128, 1}, common::Int(64)}}, {{"padding_idx", padding_idx}}));
+}
+
+TEST_F(PerformanceTester, Gather) {
+  int axis = 3;
+
+  Evaluate(tests::OpBuilder("gather").Build(
+      {{"operand", {10, 12, 128, 512}}, {"index", {1, 1, 1, 128}, common::Int(32)}}, {{"axis", axis}}));
+}
+
+// paddle model test
+TEST_F(PerformanceTester, ResNet50) {
+  CHECK_NE(FLAGS_resnet50_model_dir, "");
+  std::unordered_map<std::string, std::vector<int>> feeds = {{"inputs", {batch_size, 3, 224, 224}}};
+  Evaluate(cinn::frontend::PaddleModelConvertor(common::DefaultNVGPUTarget())
+               .LoadModel(FLAGS_resnet50_model_dir, true, feeds));
+}
+
+}  // namespace auto_schedule
+}  // namespace cinn
diff --git a/paddle/cinn/auto_schedule/tuning.h b/paddle/cinn/auto_schedule/tuning.h
new file mode 100644
index 0000000000000..0b2bfe66d1273
--- /dev/null
+++ b/paddle/cinn/auto_schedule/tuning.h
@@ -0,0 +1,91 @@
+// Copyright (c) 2022 CINN Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <memory>
+#include <vector>
+
+#include "cinn/hlir/framework/graph.h"
+#include "cinn/hlir/framework/node.h"
+#include "cinn/ir/lowered_func.h"
+
+namespace cinn {
+namespace auto_schedule {
+
+// alias a LoweredFunc array as FunctionGroup
+using FunctionGroup = std::vector<ir::LoweredFunc>;
+// alias a shared pointer to Graph::Group as SubGraphPtr
+using SubGraphPtr = std::shared_ptr<hlir::framework::Graph::Group>;
+
+// Options for the tuning process
+struct TuningOptions {
+  // The number of tuning rounds; each round tunes several tasks and
+  // each task involves TuningOptions.num_measure_trials measurements.
+  int num_tuning_rounds = 1;
+
+  // The number of measurement trials in a task. If it is 0,
+  // the tuner returns the best candidate schedule config
+  // without any hardware measurement.
+  int num_measure_trials = 10;
+
+  // Every round the TaskScheduler chooses some TuneTask(s) to optimize and
+  // runs several iterations of the search algorithm to generate samples.
+  // Each iteration produces num_samples_per_iteration samples.
+  //
+  // 1. If TuningOptions.num_measure_trials is 0, auto-tuning involves no
+  //    hardware measurement and predicts performance with the cost model.
+  //
+  // 2. num_measure_trials % num_samples_per_iteration must equal 0.
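+  //    (e.g. num_measure_trials = 10 with num_samples_per_iteration = 5
+  //    yields exactly two search iterations per round.)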
+  //    In each round, auto-tuning runs search iterations until the number of
+  //    iterations * num_samples_per_iteration equals num_measure_trials.
+  int num_samples_per_iteration = 10;
+
+  //////////////////////////////////////
+  // Evolutionary Search Related Options
+  //////////////////////////////////////
+
+  // The number of picks from the stored database in each iteration.
+  // These are the best performing records from previous generations.
+  //
+  // Note that returning the full topk is not guaranteed when the database
+  // doesn't hold enough data; evolutionary search takes as many records as
+  // are available without raising errors or warnings.
+  int evolution_pick_database_topk = 8;
+
+  // The number of initial populations at each generation. It contains
+  // the picks from the database plus randomly generated samples.
+  int evolution_init_population_num = 10;
+
+  // The number of samples generated by cross-over
+  int evolution_cross_over_num = 0;
+
+  // The fraction of random samples in num_samples_per_iteration.
+  // So num_samples_per_iteration consists of (1 - eps_greedy) best
+  // samples from evolutionary search and eps_greedy random samples.
+  //
+  // It explores the cases evolutionary search won't predict precisely.
+  float evolution_eps_greedy = 0.1f;
+};
+
+// Result of the tuning process
+struct TuningResult {
+  // Result of graph tuning
+  std::vector<SubGraphPtr> subgraphs;
+  // Result of schedule tuning
+  std::vector<FunctionGroup> function_groups;
+};
+
+}  // namespace auto_schedule
+}  // namespace cinn
diff --git a/paddle/cinn/backends/CMakeLists.txt b/paddle/cinn/backends/CMakeLists.txt
new file mode 100755
index 0000000000000..3949bc4e7313d
--- /dev/null
+++ b/paddle/cinn/backends/CMakeLists.txt
@@ -0,0 +1,67 @@
+core_gather_headers()
+
+gather_srcs(cinnapi_src SRCS
+  outputs.cc
+  codegen_c.cc
+  codegen_c_x86.cc
+  codegen_cuda_host.cc
+  extern_func_emitter.cc
+  extern_func_emitter_builtin.cc
+  function_prototype.cc
+  extern_func_protos.cc
+  extern_func_jit_register.cc
+  modular.cc
+  compiler.cc
+)
+
+if (WITH_CUDA)
+  add_subdirectory(nvrtc)
+  list(APPEND srcs cuda_util.cc codegen_cuda_dev.cc codegen_cuda_util.cc)
+endif()
+
+if (WITH_OPENMP)
+  cc_library(__x86_source_fake_lib SRCS _x86_builtin_source.cc)
+endif()
+add_subdirectory(llvm)
+
+if (WITH_CUDA)
+  nv_test(test_raw_cuda_code SRCS raw_cuda_code_test.cu DEPS cinncore)
+endif()
+
+cc_test(test_codegen_c SRCS codegen_c_test.cc DEPS cinncore ARGS ${global_test_args})
+cc_test(test_codegen_c_x86 SRCS codegen_c_x86_test.cc DEPS cinncore ARGS ${global_test_args})
+cc_test(test_generated1 SRCS generated_module1.cc DEPS cinn_runtime)
+add_run_test_dependency(test_generated1 test_codegen_c)
+cc_test(test_ir_schedule SRCS ir_schedule_test.cc DEPS cinncore)
+include_directories(${CMAKE_SOURCE_DIR}/cinn/runtime)
+if (TARGET test_generated1)
+  add_dependencies(test_generated1 test_codegen_c)
+endif()
+
+if (WITH_CUDA)
+  nv_test(test_codegen_cuda_generate SRCS codegen_cuda_generate_test.cc DEPS cinncore)
+  nv_test(test_codegen_debug SRCS codegen_debug_test.cc DEPS cinncore)
+
+  if (WITH_TESTING)
+    nv_test(generated1_cuda SRCS generated1.cu DEPS cinncore)
+    add_run_test_dependency(generated1_cuda test_codegen_cuda_generate)
+  endif()
+
+  nv_test(test_compiler SRCS compiler_test.cc DEPS cinncore)
+else()
+  cc_test(test_compiler SRCS compiler_test.cc DEPS cinncore)
+endif()
+
+foreach(cpp ${srcs})
+  set(cinnapi_src
+      "${cinnapi_src};cinn/backends/${cpp}"
+      CACHE INTERNAL "")
+endforeach()
+
+file(GLOB includes LIST_DIRECTORIES false RELATIVE ${CMAKE_SOURCE_DIR} *.h)
+
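+# Each collected header is appended to the global `core_includes` cache
+# variable below; packaging rules elsewhere consume this list, e.g.
+# (illustrative only):
+#   install(FILES ${core_includes} DESTINATION include/cinn/backends)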
+foreach(header ${includes}) + set(core_includes "${core_includes};${header}" CACHE INTERNAL "") +endforeach() diff --git a/paddle/cinn/backends/_x86_builtin_source.cc b/paddle/cinn/backends/_x86_builtin_source.cc new file mode 100644 index 0000000000000..f29b3cc79ca81 --- /dev/null +++ b/paddle/cinn/backends/_x86_builtin_source.cc @@ -0,0 +1,378 @@ +// Copyright (c) 2021 CINN Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +/// Predefined utilities in CINN BEGIN( +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +#include +#include +#include +#include + +#include + +#include "cinn/runtime/cpu/thread_backend.h" + +#ifndef _CINN_X86_BUILTIN_SOURCE_ +#define _CINN_X86_BUILTIN_SOURCE_ +//! Vector in stack, this can only used in generated .cc file. +template +struct StackVec { + typedef T value_type; + typedef StackVec self_type; + + self_type& operator=(const StackVec& src) { + if (this != &src) { + memcpy(data_, src.data_, num_bytes()); + } + return *this; + } + + StackVec() { memset(data_, 0, num_bytes()); } + + explicit StackVec(const T* externl) : external_data_(externl) {} + + static self_type Broadcast(const value_type& v) { + self_type res; + for (size_t i = 0; i < Num; i++) res.data_[i] = v; + return res; + } + + static self_type Ramp(const value_type& base, const value_type& stride) { + self_type res; + for (size_t i = 0; i < Num; i++) { + res.data_[i] = base + stride * i; + } + } + + static self_type Load(const void* base, int32_t offset) { + self_type res; + memcpy(&res.data_[0], (const value_type*)base + offset, num_bytes()); + } + + static self_type Load(const void* base, const StackVec& offset) { + self_type res; + for (size_t i = 0; i < Num; i++) { + res.data_[i] = ((const value_type*)base)[offset[i]]; + } + } + + void Store(void* base, int32_t offset) const { + mempcpy((value_type*)base + offset, &data_[0], num_bytes()); // NOLINT + } + + inline value_type& operator[](size_t i) { return data_[i]; } + inline value_type operator[](size_t i) const { return data_[i]; } + + // binary operator between two vectors + // @{ +#define __(op__) \ + friend self_type operator op__(const self_type& a, const self_type& b) { \ + self_type res; \ + for (size_t i = 0; i < Num; i++) { \ + res.data_[i] = a[i] op__ b[i]; \ + } \ + return res; \ + } + __(+) + __(-) + __(*) + __(/) + __(%) + // @} +#undef __ + + // binary operator between a vector and a scalar + // @{ +#define __(op__) \ + friend self_type operator op__(const self_type& a, const value_type& b) { \ + self_type res; \ + for (size_t i = 0; i < Num; i++) { \ + res.data_[i] = a[i] op__ b; \ + } \ + return res; \ + } + __(+) + __(-) + __(*) + __(/) + __(%) +#undef __ + // @} + + static constexpr size_t num_bytes() { return sizeof(data_); } + + private: + T data_[Num]; + T* external_data_{nullptr}; +}; + +/** 
+ * The vector with external data. + */ +template +struct ExternalVec { + typedef T value_type; + typedef ExternalVec self_type; + + explicit ExternalVec(T* data) : data_(data) {} + + self_type& operator=(const self_type& src) { + if (data_ != src.data_) { + memcpy(data_, src.data_, num_bytes()); + } + return *this; + } + + static self_type Load(const void* base, int32_t offset) { + self_type res((T*)base + offset); // NOLINT + return res; + } + + static constexpr size_t num_bytes() { return sizeof(value_type) * Num; } + + private: + T* data_{nullptr}; +}; + +// AVX256 load +//@{ +inline __m256 cinn_avx256_load(const float* dst) { return _mm256_load_ps(dst); } +inline __m256d cinn_avx256_load(const double* dst) { return _mm256_load_pd(dst); } +//@} +// AVX512 load +//@{ +inline __m512 cinn_avx512_load(const float* dst) { return _mm512_load_ps(dst); } +inline __m512d cinn_avx512_load(const double* dst) { return _mm512_load_pd(dst); } +//@} + +// FP32x8 * FP32x8 +// @{ +inline void cinn_avx256_add(float* dst, float* a, float* b) { + _mm256_store_ps(dst, _mm256_add_ps(_mm256_load_ps(a), _mm256_load_ps(b))); +} +inline void cinn_avx256_sub(float* dst, float* a, float* b) { + _mm256_store_ps(dst, _mm256_sub_ps(_mm256_load_ps(a), _mm256_load_ps(b))); +} +inline void cinn_avx256_mul(float* dst, float* a, float* b) { + _mm256_store_ps(dst, _mm256_mul_ps(_mm256_load_ps(a), _mm256_load_ps(b))); +} +inline void cinn_avx256_div(float* dst, float* a, float* b) { + _mm256_store_ps(dst, _mm256_div_ps(_mm256_load_ps(a), _mm256_load_ps(b))); +} +// @} + +// FP32x4 * float +// @{ +inline void cinn_avx256_add(float* dst, float* a, float b) { + _mm256_store_ps(dst, _mm256_add_ps(_mm256_load_ps(a), _mm256_set1_ps(b))); +} +inline void cinn_avx256_sub(float* dst, float* a, float b) { + _mm256_store_ps(dst, _mm256_sub_ps(_mm256_load_ps(a), _mm256_set1_ps(b))); +} +inline void cinn_avx256_mul(float* dst, float* a, float b) { + _mm256_store_ps(dst, _mm256_mul_ps(_mm256_load_ps(a), _mm256_set1_ps(b))); +} +inline void cinn_avx256_div(float* dst, float* a, float b) { + _mm256_store_ps(dst, _mm256_div_ps(_mm256_load_ps(a), _mm256_set1_ps(b))); +} +// @} + +// float * FP32x4 +// @{ +inline void cinn_avx256_add(float* dst, float a, float* b) { + _mm256_store_ps(dst, _mm256_add_ps(_mm256_set1_ps(a), _mm256_load_ps(b))); +} +inline void cinn_avx256_sub(float* dst, float a, float* b) { + _mm256_store_ps(dst, _mm256_sub_ps(_mm256_set1_ps(a), _mm256_load_ps(b))); +} +inline void cinn_avx256_mul(float* dst, float a, float* b) { + _mm256_store_ps(dst, _mm256_mul_ps(_mm256_set1_ps(a), _mm256_load_ps(b))); +} +inline void cinn_avx256_div(float* dst, float a, float* b) { + _mm256_store_ps(dst, _mm256_div_ps(_mm256_set1_ps(a), _mm256_load_ps(b))); +} +// @} + +// 4 x float64 +// @{ +inline void cinn_avx256_add(double* dst, double* a, double* b) { + _mm256_store_pd(dst, _mm256_add_pd(_mm256_load_pd(a), _mm256_load_pd(b))); +} +inline void cinn_avx256_sub(double* dst, double* a, double* b) { + _mm256_store_pd(dst, _mm256_sub_pd(_mm256_load_pd(a), _mm256_load_pd(b))); +} +inline void cinn_avx256_mul(double* dst, double* a, double* b) { + _mm256_store_pd(dst, _mm256_mul_pd(_mm256_load_pd(a), _mm256_load_pd(b))); +} +inline void cinn_avx256_div(double* dst, double* a, double* b) { + _mm256_store_pd(dst, _mm256_div_pd(_mm256_load_pd(a), _mm256_load_pd(b))); +} +// @} + +// FP32x4 * FP64 +// @{ +inline void cinn_avx256_add(double* dst, double* a, double b) { + _mm256_store_pd(dst, _mm256_add_pd(_mm256_load_pd(a), _mm256_set1_pd(b))); +} 
+inline void cinn_avx256_sub(double* dst, double* a, double b) { + _mm256_store_pd(dst, _mm256_sub_pd(_mm256_load_pd(a), _mm256_set1_pd(b))); +} +inline void cinn_avx256_mul(double* dst, double* a, double b) { + _mm256_store_pd(dst, _mm256_mul_pd(_mm256_load_pd(a), _mm256_set1_pd(b))); +} +inline void cinn_avx256_div(double* dst, double* a, double b) { + _mm256_store_pd(dst, _mm256_div_pd(_mm256_load_pd(a), _mm256_set1_pd(b))); +} +// @} + +// float * FP32x4 +// @{ +inline void cinn_avx256_add(double* dst, double a, double* b) { + _mm256_store_pd(dst, _mm256_add_pd(_mm256_set1_pd(a), _mm256_load_pd(b))); +} +inline void cinn_avx256_sub(double* dst, double a, double* b) { + _mm256_store_pd(dst, _mm256_sub_pd(_mm256_set1_pd(a), _mm256_load_pd(b))); +} +inline void cinn_avx256_mul(double* dst, double a, double* b) { + _mm256_store_pd(dst, _mm256_mul_pd(_mm256_set1_pd(a), _mm256_load_pd(b))); +} +inline void cinn_avx256_div(double* dst, double a, double* b) { + _mm256_store_pd(dst, _mm256_div_pd(_mm256_set1_pd(a), _mm256_load_pd(b))); +} +// @} + +//! 32 x float32 operations. +// @{ +inline void cinn_avx512_add(float* dst, float* a, float* b) { + _mm512_store_ps(dst, _mm512_add_ps(_mm512_load_ps(a), _mm512_load_ps(b))); +} +inline void cinn_avx512_sub(float* dst, float* a, float* b) { + _mm512_store_ps(dst, _mm512_sub_ps(_mm512_load_ps(a), _mm512_load_ps(b))); +} +inline void cinn_avx512_mul(float* dst, float* a, float* b) { + _mm512_store_ps(dst, _mm512_mul_ps(_mm512_load_ps(a), _mm512_load_ps(b))); +} +inline void cinn_avx512_div(float* dst, float* a, float* b) { + _mm512_store_ps(dst, _mm512_div_ps(_mm512_load_ps(a), _mm512_load_ps(b))); +} +// @} + +// FP32x4 * FP64 +// @{ +inline void cinn_avx512_add(float* dst, float* a, float b) { + _mm512_store_pd(dst, _mm512_add_pd(_mm512_load_pd(a), _mm512_set1_pd(b))); +} +inline void cinn_avx512_sub(float* dst, float* a, float b) { + _mm512_store_pd(dst, _mm512_sub_pd(_mm512_load_pd(a), _mm512_set1_pd(b))); +} +inline void cinn_avx512_mul(float* dst, float* a, float b) { + _mm512_store_pd(dst, _mm512_mul_pd(_mm512_load_pd(a), _mm512_set1_pd(b))); +} +inline void cinn_avx512_div(float* dst, float* a, float b) { + _mm512_store_pd(dst, _mm512_div_pd(_mm512_load_pd(a), _mm512_set1_pd(b))); +} +// @} + +// float * FP32x4 +// @{ +inline void cinn_avx512_add(float* dst, float a, float* b) { + _mm512_store_pd(dst, _mm512_add_pd(_mm512_set1_pd(a), _mm512_load_pd(b))); +} +inline void cinn_avx512_sub(float* dst, float a, float* b) { + _mm512_store_pd(dst, _mm512_sub_pd(_mm512_set1_pd(a), _mm512_load_pd(b))); +} +inline void cinn_avx512_mul(float* dst, float a, float* b) { + _mm512_store_pd(dst, _mm512_mul_pd(_mm512_set1_pd(a), _mm512_load_pd(b))); +} +inline void cinn_avx512_div(float* dst, float a, float* b) { + _mm512_store_pd(dst, _mm512_div_pd(_mm512_set1_pd(a), _mm512_load_pd(b))); +} +// @} + +//! 16 x float32 operations. 
+// @{ +inline void cinn_avx512_add(double* dst, double* a, double* b) { + _mm512_store_pd(dst, _mm512_add_pd(_mm512_load_pd(a), _mm512_load_pd(b))); +} +inline void cinn_avx512_sub(double* dst, double* a, double* b) { + _mm512_store_pd(dst, _mm512_sub_pd(_mm512_load_pd(a), _mm512_load_pd(b))); +} +inline void cinn_avx512_mul(double* dst, double* a, double* b) { + _mm512_store_pd(dst, _mm512_mul_pd(_mm512_load_pd(a), _mm512_load_pd(b))); +} +inline void cinn_avx512_div(double* dst, double* a, double* b) { + _mm512_store_pd(dst, _mm512_div_pd(_mm512_load_pd(a), _mm512_load_pd(b))); +} +// @} + +inline __m512 cinn_avx512_add(const __m512& a, const __m512& b); + +inline __m256 cinn_avx256_add_float(const __m256& a, const __m256& b) { return _mm256_add_ps(a, b); } +inline __m256d cinn_avx256_add_double(const __m256d& a, const __m256d& b) { return _mm256_add_pd(a, b); } +inline __m512 cinn_avx512_add_float(const __m512& a, const __m512& b) { return _mm512_add_ps(a, b); } +inline __m512d cinn_avx512_add_double(const __m512d& a, const __m512d& b) { return _mm512_add_pd(a, b); } + +//! set1 +// @{ +inline __m256 cinn_avx256_set1(float value) { return _mm256_set1_ps(value); } +inline __m256d cinn_avx256_set1(double value) { return _mm256_set1_pd(value); } +inline __m512 cinn_avx512_set1(float value) { return _mm512_set1_ps(value); } +inline __m512d cinn_avx512_set1(double value) { return _mm512_set1_pd(value); } +// @} + +//! store +// @{ +inline void cinn_avx512_store(float* dst, const __m512& x) { _mm512_store_ps(dst, x); } +inline void cinn_avx512_store(double* dst, const __m512d& x) { _mm512_store_pd(dst, x); } +inline void cinn_avx256_store(float* dst, const __m256& x) { _mm256_store_ps(dst, x); } +inline void cinn_avx256_store(double* dst, const __m256d& x) { _mm256_store_pd(dst, x); } +// @} + +//! add +// @{ +inline __m256 cinn_avx256_add(const __m256& a, const __m256& b) { return _mm256_add_ps(a, b); } +inline __m256d cinn_avx256_add(const __m256d& a, const __m256d& b) { return _mm256_add_pd(a, b); } +inline __m512 cinn_avx512_add(const __m512& a, const __m512& b) { return _mm512_add_ps(a, b); } +inline __m512d cinn_avx512_add(const __m512d& a, const __m512d& b) { return _mm512_add_pd(a, b); } +// @} + +//! mul +// @{ +inline __m256 cinn_avx256_mul(const __m256& a, const __m256& b) { return _mm256_mul_ps(a, b); } +inline __m256d cinn_avx256_mul(const __m256d& a, const __m256d& b) { return _mm256_mul_pd(a, b); } +inline __m512 cinn_avx512_mul(const __m512& a, const __m512& b) { return _mm512_mul_ps(a, b); } +inline __m512d cinn_avx512_mul(const __m512d& a, const __m512d& b) { return _mm512_mul_pd(a, b); } +// @} + +//! 
fma +// @{ +inline __m128 cinn_avx128_fma(const __m128& a, const __m128& b, const __m128& c) { return _mm_fmadd_ps(a, b, c); } +inline __m128d cinn_avx128_fma(const __m128d& a, const __m128d& b, const __m128d& c) { return _mm_fmadd_pd(a, b, c); } +inline __m256 cinn_avx256_fma(const __m256& a, const __m256& b, const __m256& c) { return _mm256_fmadd_ps(a, b, c); } +inline __m256d cinn_avx256_fma(const __m256d& a, const __m256d& b, const __m256d& c) { + return _mm256_fmadd_pd(a, b, c); +} +inline __m512 cinn_avx512_fma(const __m512& a, const __m512& b, const __m512& c) { return _mm512_fmadd_ps(a, b, c); } +inline __m512d cinn_avx512_fma(const __m512d& a, const __m512d& b, const __m512d& c) { + return _mm512_fmadd_pd(a, b, c); +} +// @} + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +/// )END Predefined utilities in CINN +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +#endif // _CINN_X86_BUILTIN_SOURCE_ diff --git a/paddle/cinn/backends/codegen_c.cc b/paddle/cinn/backends/codegen_c.cc new file mode 100644 index 0000000000000..a5a26ecea027c --- /dev/null +++ b/paddle/cinn/backends/codegen_c.cc @@ -0,0 +1,868 @@ +// Copyright (c) 2021 CINN Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "cinn/backends/codegen_c.h" + +#include +#include + +#include "cinn/backends/extern_func_emitter.h" +#include "cinn/backends/extern_func_emitter_builtin.h" +#include "cinn/ir/ir_operators.h" +#include "cinn/ir/ir_verify.h" +#include "cinn/ir/lowered_func.h" +#include "cinn/optim/ir_simplify.h" +#include "cinn/optim/remove_nested_block.h" +#include "cinn/runtime/cpu/thread_backend.h" +#include "cinn/runtime/intrinsic.h" +#include "cinn/utils/string.h" + +//! Root of the builtin code. 
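+// (set via the --cinn_x86_builtin_code_root flag; see PrintBuiltinCodes()
+// below, which inlines the builtin sources when inline_builtin_codes_ is
+// enabled)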
+DECLARE_string(cinn_x86_builtin_code_root); + +namespace cinn { +namespace backends { +using namespace utils; // NOLINT +using cinn::common::float16; + +const char *kCKeywordRestrict = "__restrict__"; + +void CodeGenC::Compile(const ir::Module &module, const Outputs &outputs) { + ir::IrVerify(Expr(module)); + + if (!outputs.c_header_name.empty()) { + auto source = Compile(module, OutputKind::CHeader); + std::ofstream file(outputs.c_header_name); + CHECK(file.is_open()) << "failed to open file " << outputs.c_header_name; + file << source; + file.close(); + LOG(WARNING) << "Output C header to file " << outputs.c_header_name; + } + + if (!outputs.c_source_name.empty()) { + auto source = Compile(module, OutputKind::CImpl); + std::ofstream file(outputs.c_source_name); + CHECK(file.is_open()) << "failed to open file " << outputs.c_source_name; + file << source; + file.close(); + LOG(WARNING) << "Output C source to file " << outputs.c_source_name; + } +} + +CodeGenC::CodeGenC(Target target) : ir::IrPrinter(ss_) {} + +std::string CodeGenC::Compile(const ir::Module &module, OutputKind output_kind) { + if (output_kind == OutputKind::CHeader) { + GenerateHeaderFile(module); + } else if (output_kind == OutputKind::CImpl) { + PrintIncludes(); + + if (inline_builtin_codes_) PrintBuiltinCodes(); + + std::vector buffers; + for (auto &buffer : module->buffers) { + buffers.emplace_back(buffer.as_buffer_ref()); + } + + for (auto &func : module.functions()) { + Compile(func); + } + } else { + LOG(FATAL) << "Not supported OutputKind"; + } + return ss_.str(); +} +std::string CodeGenC::Compile(const ir::LoweredFunc &function) { + CHECK(function.defined()); + Print(function); + os() << "\n\n"; + return ss_.str(); +} + +std::string CodeGenC::GetTypeName(Type type) { + // common scalar type +#define GET_SCALAR_TYPE(pred_expr, scalar_name) \ + if (pred_expr) { \ + return scalar_name; \ + } + + GET_SCALAR_TYPE(type.is_void(), "void"); + GET_SCALAR_TYPE(type.is_bool(), "bool"); + + GET_SCALAR_TYPE(type.is_int(8), "int8_t"); + GET_SCALAR_TYPE(type.is_int(16), "int16_t"); + GET_SCALAR_TYPE(type.is_int(32), "int32_t"); + GET_SCALAR_TYPE(type.is_int(64), "int64_t"); + + GET_SCALAR_TYPE(type.is_uint(8), "uint8_t"); + GET_SCALAR_TYPE(type.is_uint(16), "uint16_t"); + GET_SCALAR_TYPE(type.is_uint(32), "uint32_t"); + GET_SCALAR_TYPE(type.is_uint(64), "uint64_t"); + + GET_SCALAR_TYPE(type.is_bfloat16(), "bfloat16"); + GET_SCALAR_TYPE(type.is_float16(), "float16"); + GET_SCALAR_TYPE(type.is_float(32), "float") + GET_SCALAR_TYPE(type.is_float(64), "double") +#undef GET_SCALAR_TYPE + + // customized_type + if (type.is_customized_type()) { + CHECK(!type.customized_type().empty()) << "customized_type can't be empty."; + auto customized_name = type.customized_type(); + // get name of a cuda built-in vector type, it is started with a 'CudaVectorType::' prefix + if (utils::Startswith(customized_name, common::customized_type::kcuda_builtin_vector_t)) { + customized_name.erase(0, strlen(common::customized_type::kcuda_builtin_vector_t)); + } + return customized_name; + } + + // other types are not implemented yet + CINN_NOT_IMPLEMENTED + return ""; +} + +std::string CodeGenC::GetTypeRepr(Type type) { + std::string str; + if (type.is_cpp_const()) { + str = "const "; + } + + str += GetTypeName(type); + if (type.is_cpp_handle()) { + str += "*"; + } else if (type.is_cpp_handle2()) { + str += "**"; + } + return str; +} +void CodeGenC::Visit(const ir::IntImm *op) { IrPrinter::Visit(op); } +void CodeGenC::Visit(const ir::UIntImm *op) { 
IrPrinter::Visit(op); } +void CodeGenC::Visit(const ir::FloatImm *op) { IrPrinter::Visit(op); } +void CodeGenC::Visit(const ir::StringImm *op) { IrPrinter::Visit(op); } +void CodeGenC::Visit(const ir::Add *op) { IrPrinter::Visit(op); } +void CodeGenC::Visit(const ir::Sub *op) { IrPrinter::Visit(op); } +void CodeGenC::Visit(const ir::Mul *op) { IrPrinter::Visit(op); } +void CodeGenC::Visit(const ir::Div *op) { IrPrinter::Visit(op); } +void CodeGenC::Visit(const ir::Mod *op) { + auto copied = op->b(); + optim::Simplify(&copied); + if (copied.is_constant()) { + int temp = (int)(copied.get_constant()); + if ((temp & (temp - 1)) == 0) { + os() << "("; + Print(op->a()); + os() << " & "; + os() << std::to_string(temp - 1); + os() << ")"; + return; + } + } + PrintBinaryOp("%", op); +} +void CodeGenC::Visit(const ir::EQ *op) { IrPrinter::Visit(op); } +void CodeGenC::Visit(const ir::NE *op) { IrPrinter::Visit(op); } +void CodeGenC::Visit(const ir::LT *op) { IrPrinter::Visit(op); } +void CodeGenC::Visit(const ir::LE *op) { IrPrinter::Visit(op); } +void CodeGenC::Visit(const ir::GT *op) { IrPrinter::Visit(op); } +void CodeGenC::Visit(const ir::GE *op) { IrPrinter::Visit(op); } +void CodeGenC::Visit(const ir::And *op) { PrintBinaryOp("&&", op); } +void CodeGenC::Visit(const ir::Or *op) { PrintBinaryOp("||", op); } +void CodeGenC::Visit(const ir::Min *op) { IrPrinter::Visit(op); } +void CodeGenC::Visit(const ir::Max *op) { IrPrinter::Visit(op); } +void CodeGenC::Visit(const ir::Minus *op) { IrPrinter::Visit(op); } +void CodeGenC::Visit(const ir::Not *op) { + os() << "(!"; + IrPrinter::Print(op->v()); + os() << ")"; +} +void CodeGenC::Visit(const ir::Cast *op) { PrintCastExpr(op->type(), op->v()); } +void CodeGenC::Visit(const ir::For *op) { + Expr extent = op->extent; + Expr min = op->min; + int num_task = 1; + if (op->is_parallel()) { + os() << "int num_task = max_concurrency();\n"; + DoIndent(); + os() << "omp_set_num_threads(num_task);\n"; + DoIndent(); + os() << "auto flambda = [=](int task_id, int num_task) -> int {\n"; + IncIndent(); + DoIndent(); + os() << "int n_per_task = "; + Expr num_task_var = Var("num_task"); + Print((op->extent + num_task_var - 1) / num_task_var); + os() << ";\n"; + CHECK_EQ(min.as_int32(), 0); + auto task_id = Var("task_id"); + auto n_per_task = Var("n_per_task"); + min = task_id * n_per_task; + extent = (task_id + 1) * n_per_task; + DoIndent(); + } + os() << "for ("; + os() << GetTypeRepr(Int(32)); + os() << " " << op->loop_var->name; + os() << " = "; + Print(min); + os() << "; "; + os() << op->loop_var->name; + os() << " < "; + Print(op->extent); + if (op->is_parallel()) { + os() << " && "; + os() << op->loop_var->name; + os() << " < "; + Print(extent); + } + os() << "; "; + + os() << op->loop_var->name; + os() << " += 1"; + os() << ") "; + + Print(op->body); + if (op->is_parallel()) { + os() << "\n"; + DoIndent(); + os() << "return 0;\n"; + DecIndent(); + DoIndent(); + os() << "};\n"; + os() << "#pragma omp parallel num_threads(num_task)\n"; + DoIndent(); + os() << "{\n"; + IncIndent(); + DoIndent(); + os() << "int task_id = omp_get_thread_num();\n"; + DoIndent(); + os() << "flambda(task_id, num_task);\n"; + DecIndent(); + DoIndent(); + os() << "}"; + } +} +void CodeGenC::Visit(const ir::PolyFor *op) { + os() << "for ("; + os() << GetTypeRepr(Int(32)); + os() << " " << op->iterator->name; + os() << " = "; + Print(op->init); + os() << "; "; + Print(op->condition); + os() << "; "; + + os() << op->iterator->name; + os() << " += "; + Print(op->inc); + os() << ") "; + + 
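+ // With the header printed above and the body printed next, the emitted code
+ // reads like "for (int32_t i = 0; i < 100; i += 1) { ... }" (illustrative
+ // bounds; the actual init/condition/inc come from the PolyFor node).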
Print(op->body); +} +void CodeGenC::Visit(const ir::Select *op) { + os() << "("; + os() << "("; + Print(op->condition); + os() << ") ? "; + Print(op->true_value); + os() << " : "; + Print(op->false_value); + os() << ")"; +} +void CodeGenC::Visit(const ir::IfThenElse *op) { + os() << "if ("; + Print(op->condition); + os() << ") {\n"; + + if (!op->true_case.As()) IncIndent(); + DoIndent(); + Print(op->true_case); + if (!op->true_case.As()) os() << ";"; + os() << "\n"; + + if (!op->true_case.As()) DecIndent(); + + DoIndent(); + os() << "}"; + + if (op->false_case.defined()) { + os() << " else {\n"; + + if (!op->true_case.As()) IncIndent(); + DoIndent(); + Print(op->false_case); + if (!op->false_case.As()) os() << ";"; + os() << "\n"; + if (!op->true_case.As()) DecIndent(); + + DoIndent(); + os() << "}"; + } +} +void CodeGenC::Visit(const ir::Block *op) { + os() << "{\n"; + + IncIndent(); + + for (int i = 0; i < op->stmts.size() - 1; i++) { + DoIndent(); + Print(op->stmts[i]); + os() << ";\n"; + } + if (op->stmts.size() >= 1) { + DoIndent(); + Print(op->stmts.back()); + os() << ";"; + } + + DecIndent(); + os() << "\n"; + DoIndent(); + os() << "}"; +} +void CodeGenC::Visit(const ir::Call *op) { + if (op->name == runtime::intrinsic::buffer_malloc) { + PrintCall_buffer_malloc(op); + } else if (op->name == runtime::intrinsic::pod_values_to_array_repr) { + PrintCall_pod_values_to_array(op); + } else if (op->is_intrinsic_call()) { + os() << op->name << "("; + PrintCallArgs(op); + os() << ")"; + } else if (op->is_cinn_call()) { // call CINN LoweredFunc + os() << op->name << "("; + PrintCallArgs(op); + os() << ")"; + } else if (op->is_extern_call()) { + const auto &fn_name = ExternFunctionEmitterRegistry::Global().Lookup(ExternFuncID{backend_C, op->name.c_str()}); + if (!fn_name.empty()) { + ExternFunctionLLVMEmitter emitter(fn_name); + emitter.BindCodeGen(this); + emitter.Emit(op); + } else { + CHECK(!op->read_args.empty() || !op->write_args.empty()); + os() << op->name << "("; + PrintCallArgs(op); + os() << ")"; + } + } else { + CINN_NOT_IMPLEMENTED + } +} +void CodeGenC::PrintCallArgs(const ir::Call *op) { + if (!op->read_args.empty()) { + for (int i = 0; i < op->read_args.size() - 1; i++) { + Print(op->read_args[i]); + os() << ", "; + } + Print(op->read_args.back()); + } + if (!op->write_args.empty()) { + if (!op->read_args.empty()) os() << ", "; + + for (int i = 0; i < op->write_args.size() - 1; i++) { + Print(op->write_args[i]); + os() << ", "; + } + Print(op->write_args.back()); + } +} + +void CodeGenC::PrintCall_buffer_malloc(const ir::Call *op) { + CHECK_EQ(op->read_args.size(), 2UL); + os() << op->name << "("; + PrintCastExpr("void*", op->read_args[0]); + os() << ", "; + os() << op->read_args[1]; + os() << ")"; +} + +void CodeGenC::PrintCall_cinn_pod_value_to_(const ir::Call *op) { + CHECK_EQ(op->read_args.size(), 1UL); + os() << op->name << "("; + os() << "&("; + Print(op->read_args[0]); + os() << ")"; + os() << ")"; +} + +void CodeGenC::PrintCall_get_address(const ir::Call *op) { + CHECK_EQ(op->read_args.size(), 1UL); + CHECK(op->write_args.empty()); + auto *read_var = op->read_args.front().as_var(); + auto *read_buf = op->read_args.front().as_buffer(); + CHECK(read_var || read_buf) << "Only Var or Buffer can get address"; + + if (read_var) { + if (read_var->type().lanes() <= 1) os() << "&"; + os() << read_var->name; + } else if (read_buf) { + if (read_buf->type().lanes() <= 1) os() << "&"; + os() << read_buf->name; + } else { + CINN_NOT_IMPLEMENTED + } +} + +void 
CodeGenC::PrintCall_pod_values_to_array(const ir::Call *op) { + CHECK(!op->read_args.empty()); + CHECK_EQ(op->write_args.size(), 1UL); + auto output_var = op->write_args.front().as_var_ref(); + CHECK(output_var.defined()); + + std::vector arg_names; + for (auto &arg : op->read_args) { + auto arg_var = arg.as_var(); + CHECK(arg_var); + arg_names.push_back(arg_var->name); + } + + os() << "cinn_pod_value_t " << output_var->name << "[] = "; + os() << "{ "; + + os() << utils::Join(arg_names, ", "); + + os() << " }"; +} + +void CodeGenC::Visit(const ir::_Module_ *op) { CINN_NOT_IMPLEMENTED } +void CodeGenC::Visit(const ir::_Var_ *op) { os() << op->name; } + +void CodeGenC::Visit(const ir::Load *op) { + Expr dense_strided_ramp = detail::StridedRampBase(op->index(), 1); + if (dense_strided_ramp.defined()) { // Loading a continuous Ramp address. + CHECK(op->type().is_vector()); + PrintStackVecType(op->type().ElementOf(), op->index().type().lanes()); + os() << "::" + << "Load("; + os() << op->tensor.As()->name; + os() << ","; + Print(dense_strided_ramp); + os() << ")"; + } else if (op->index().type().is_vector()) { + // gather + CHECK(op->type().is_vector()); + PrintStackVecType(op->type().ElementOf(), op->index().type().lanes()); + os() << "::Load("; + os() << op->tensor.As()->name; + os() << ","; + Print(op->index()); + os() << ")"; + } else if (op->is_addr_tensor()) { + auto *tensor = op->tensor.As(); + os() << tensor->name << "["; + Print(op->index()); + os() << "]"; + } else { + IrPrinter::Visit(op); + } +} + +void CodeGenC::Visit(const ir::Store *op) { + CHECK(op->is_addr_tensor()); + + auto *tensor = op->tensor.As(); + CHECK(tensor); + os() << tensor->name << "["; + Print(op->index()); + os() << "]"; + os() << " = "; + Print(op->value); +} +void CodeGenC::Visit(const ir::Alloc *op) { + os() << runtime::intrinsic::buffer_malloc; + os() << "("; + os() << "(void*)(0), "; + + auto *buffer = op->destination.As(); + os() << buffer->name; + os() << ")"; +} + +void CodeGenC::Visit(const ir::Free *op) { + os() << runtime::intrinsic::buffer_free; + os() << "("; + os() << "(void*)(0), "; + + auto *buffer = op->destination.As(); + os() << buffer->name; + os() << ")"; +} + +void CodeGenC::Visit(const ir::_Buffer_ *op) { os() << op->name; } +void CodeGenC::Visit(const ir::_Tensor_ *op) { os() << op->buffer->name; } +void CodeGenC::Visit(const ir::Let *op) { + bool is_vec = false; + CHECK(op->type().valid()); + if (op->body.defined() && op->body.As()) { + // broadcast's type is hard to print, so use c++11 auto instead. + os() << "auto"; + is_vec = true; + } else { + os() << GetTypeRepr(op->type()); + } + + os() << " "; + Print(op->symbol); + + // native C array. 
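+ // e.g. a 4-lane float Let that is not a StackVec prints as "float x[4]"
+ // (the symbol name "x" is illustrative).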
+ if (op->type().lanes() > 1 && !is_vec) {
+ os() << "[" << op->type().lanes() << "]";
+ }
+
+ if (op->body.defined()) {
+ os() << " = ";
+ Print(op->body);
+ }
+}
+
+void CodeGenC::Visit(const ir::Reduce *op) {
+ LOG(FATAL) << "Reduce IR is only for internal representation and should not be used for CodeGen.";
+}
+
+void CodeGenC::Visit(const ir::Ramp *op) {
+ os() << "StackVec<" << op->lanes << "," << GetTypeRepr(op->type().ElementOf()) << ">::Ramp(";
+ Print(op->base);
+ os() << ", ";
+ Print(op->stride);
+ os() << ", ";
+ os() << op->lanes;
+ os() << ")";
+}
+
+void CodeGenC::Visit(const ir::Broadcast *op) {
+ os() << "StackVec<" << op->lanes << "," << GetTypeRepr(op->type().ElementOf()) << ">::Broadcast(";
+ Print(op->value);
+ os() << ", ";
+ os() << op->lanes << ")";
+}
+
+void CodeGenC::Visit(const ir::FracOp *op) { ir::IrPrinter::Visit(op); }
+void CodeGenC::Visit(const ir::Sum *op) { ir::IrPrinter::Visit(op); }
+void CodeGenC::Visit(const ir::Product *op) { ir::IrPrinter::Visit(op); }
+
+void CodeGenC::PrintCastExpr(const Type &type, Expr e) {
+ os() << "((" << GetTypeRepr(type) << ")";
+ os() << "(";
+ Print(e);
+ os() << "))";
+}
+void CodeGenC::PrintCastExpr(const std::string &type, Expr e) {
+ os() << "(" << type << ")";
+ os() << "(";
+ Print(e);
+ os() << ")";
+}
+
+void CodeGenC::PrintShape(const std::vector<Expr> &shape, char leftb, char rightb) {
+ os() << leftb << " ";
+
+ for (int i = 0; i < shape.size() - 1; i++) {
+ Print(shape[i]);
+ os() << ", ";
+ }
+ if (shape.size() > 1) Print(shape.back());
+
+ os() << " " << rightb;
+}
+
+void CodeGenC::Visit(const ir::_LoweredFunc_ *op) {
+ PrintFunctionDeclaration(op);
+ os() << "\n";
+
+ DoIndent();
+
+ CHECK_EQ(op->alloc_output_buffer_exprs.size(), op->dealloc_output_buffer_exprs.size())
+ << "the counts of allocation and deallocation expressions do not match";
+
+ std::vector<Expr> new_body;
+
+ std::vector<Expr> create_temp_buffers = op->PrepareCreateTempBufferExprs();
+ std::vector<Expr> alloca_temp_buffers = op->PrepareAllocTempBufferExprs();
+ std::vector<Expr> dealloca_temp_buffers = op->PrepareDeallocTempBufferExprs();
+#define APPEND_TO_NEW_BODY(field__) new_body.insert(std::end(new_body), std::begin(op->field__), std::end(op->field__));
+ APPEND_TO_NEW_BODY(argument_prepare_exprs)
+ new_body.insert(std::end(new_body), std::begin(create_temp_buffers), std::end(create_temp_buffers));
+ APPEND_TO_NEW_BODY(alloc_output_buffer_exprs)
+ new_body.insert(std::end(new_body), std::begin(alloca_temp_buffers), std::end(alloca_temp_buffers));
+ APPEND_TO_NEW_BODY(buffer_data_cast_exprs)
+ new_body.push_back(op->body);
+ new_body.insert(std::end(new_body), std::begin(dealloca_temp_buffers), std::end(dealloca_temp_buffers));
+ APPEND_TO_NEW_BODY(dealloc_output_buffer_exprs)
+
+ Expr func_body = ir::Block::Make(new_body);
+
+ optim::RemoveNestedBlock(&func_body);
+
+ Print(func_body);
+}
+void CodeGenC::PrintIncludes() {
+ os() << "#include <cinn_runtime.h>\n";
+ os() << "#include <stdio.h>\n";
+ os() << "\n";
+}
+
+void CodeGenC::PrintFileGuardOpen(const std::string &name) {
+ os() << utils::StringFormat("#ifndef _%s_CINN_H_\n", Uppercase(name).c_str());
+ os() << utils::StringFormat("#define _%s_CINN_H_\n", Uppercase(name).c_str());
+ os() << "\n";
+}
+void CodeGenC::PrintFileGuardClose(const std::string &module_name) {
+ os() << utils::StringFormat("#endif // _%s_CINN_H_\n", Uppercase(module_name).c_str());
+}
+
+void CodeGenC::PrintBufferCreation(const std::vector<ir::Buffer> &buffers) {
+ for (auto &buffer : buffers) {
+ // Ignore buffers on other devices.
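+ // (Host buffers fall through to the emission below: a Let binding a
+ // cinn_buffer_t* variable to a BufferCreate intrinsic, roughly
+ // "cinn_buffer_t* _B = <buffer_create>(target, dtype, shape[, align]);",
+ // where the callee name comes from runtime::intrinsic::buffer_create.)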
+ if (!buffer->is_on_host()) continue; + DoIndent(); + auto buffer_ptr_type = Type().set_customized_type(common::customized_type::kbuffer_t).set_cpp_handle(); + Var variable = ir::_Var_::Make(buffer->name, buffer_ptr_type); + auto expr = ir::intrinsics::BufferCreate::Make(buffer); + expr = ir::Let::Make(variable, expr); + Print(expr); + os() << ";\n"; + } +} + +void CodeGenC::PrintBufferDestroy(const std::vector &buffers) { + for (auto &buffer : buffers) { + DoIndent(); + Print(buffer.DestroyExpr()); + os() << ";\n"; + } +} + +void CodeGenC::GenerateHeaderFile(const ir::Module &module) { + PrintFileGuardOpen(module.name()); + PrintIncludes(); + + for (auto &func : module.functions()) { + PrintFunctionDeclaration(func.As()); + os() << ";\n"; + os() << "\n\n"; + } + + PrintFileGuardClose(module.name()); +} + +void CodeGenC::PrintFuncArg(const ir::Argument &arg) { + if (arg.is_buffer()) { + if (arg.is_input()) { + os() << "const struct cinn_buffer_t *"; + } else { + os() << "struct cinn_buffer_t *"; + } + } else if (arg.is_var()) { + os() << GetTypeRepr(arg.type()) << " "; + os() << arg.name(); + } else { + CINN_NOT_IMPLEMENTED + } + os() << arg.name(); +} + +void CodeGenC::PrintRuntimeType(const cinn_type_t &type) { + if (type == cinn_bool_t()) { + os() << "cinn_bool_t()"; + } else if (type == cinn_int8_t()) { + os() << "cinn_int8_t()"; + } else if (type == cinn_int16_t()) { + os() << "cinn_int16_t()"; + } else if (type == cinn_int32_t()) { + os() << "cinn_int32_t()"; + } else if (type == cinn_int64_t()) { + os() << "cinn_int64_t()"; + } else if (type == cinn_uint8_t()) { + os() << "cinn_uint8_t()"; + } else if (type == cinn_uint16_t()) { + os() << "cinn_uint16_t()"; + } else if (type == cinn_uint32_t()) { + os() << "cinn_uint32_t()"; + } else if (type == cinn_uint64_t()) { + os() << "cinn_uint64_t()"; + } else if (type == cinn_bfloat16_t()) { + os() << "cinn_bfloat16_t()"; + } else if (type == cinn_float16_t()) { + os() << "cinn_float16_t()"; + } else if (type == cinn_float32_t()) { + os() << "cinn_float32_t()"; + } else if (type == cinn_float64_t()) { + os() << "cinn_float64_t()"; + } else { + LOG(FATAL) << "Unknown type is not supported to print"; + } +} + +void CodeGenC::PrintStackVecType(Type type, int lanes) { + os() << "StackedVec<" << GetTypeRepr(type) << "," << lanes << ">"; +} + +void CodeGenC::Visit(const ir::PrimitiveNode *op) { CINN_NOT_IMPLEMENTED } +void CodeGenC::Visit(const ir::_BufferRange_ *op) { CINN_NOT_IMPLEMENTED } +void CodeGenC::Visit(const ir::ScheduleBlock *op) { CINN_NOT_IMPLEMENTED } +void CodeGenC::Visit(const ir::ScheduleBlockRealize *op) { CINN_NOT_IMPLEMENTED } + +void CodeGenC::Visit(const ir::IntrinsicOp *op) { + switch (op->getKind()) { +#define __(x) \ + case ir::IntrinsicKind::k##x: \ + Visit(llvm::dyn_cast(op)); \ + break; + + INTRINSIC_KIND_FOR_EACH(__) +#undef __ + } +} + +void CodeGenC::Visit(const ir::intrinsics::BufferGetDataHandle *op) { + os() << op->buffer.as_buffer()->name; + os() << "->"; + os() << "memory"; +} + +void CodeGenC::Visit(const ir::intrinsics::BufferGetDataConstHandle *op) { + os() << op->buffer.as_buffer()->name; + os() << "->"; + os() << "memory"; +} + +void CodeGenC::Visit(const ir::intrinsics::PodValueToX *op) { + auto to_type = op->GetOutputType(0); + if (to_type == type_of()) { + os() << runtime::intrinsic::pod_value_to_float; + } else if (to_type == type_of()) { + os() << runtime::intrinsic::pod_value_to_double; + } else if (to_type == type_of()) { + os() << runtime::intrinsic::pod_value_to_float16; + } else if (to_type == 
type_of()) { + os() << runtime::intrinsic::pod_value_to_bool; + } else if (to_type == type_of()) { + os() << runtime::intrinsic::pod_value_to_int8; + } else if (to_type == type_of()) { + os() << runtime::intrinsic::pod_value_to_int16; + } else if (to_type == type_of()) { + os() << runtime::intrinsic::pod_value_to_int32; + } else if (to_type == type_of()) { + os() << runtime::intrinsic::pod_value_to_int64; + } else if (to_type == type_of()) { + os() << runtime::intrinsic::pod_value_to_uint8; + } else if (to_type == type_of()) { + os() << runtime::intrinsic::pod_value_to_uint16; + } else if (to_type == type_of()) { + os() << runtime::intrinsic::pod_value_to_uint32; + } else if (to_type == type_of()) { + os() << runtime::intrinsic::pod_value_to_uint64; + } else if (to_type == type_of()) { + os() << runtime::intrinsic::pod_value_to_void_p; + } else if (to_type == type_of()) { + os() << runtime::intrinsic::pod_value_to_buffer_p; + } else { + LOG(FATAL) << "Not supported type: " << to_type; + } + + os() << "("; + Print(op->pod_value_ptr); + os() << ")"; +} + +void CodeGenC::Visit(const ir::intrinsics::BufferCreate *op) { + const ir::_Buffer_ *buffer_arg = op->buffer.as_buffer(); + CHECK(buffer_arg); + + os() << runtime::intrinsic::buffer_create; + os() << "("; + PrintCastExpr("cinn_device_kind_t", Expr(buffer_arg->target.runtime_arch())); + os() << "/*target*/, "; + PrintRuntimeType(runtime::ToRuntimeType(buffer_arg->dtype.ElementOf())); + os() << ", "; + PrintShape(buffer_arg->shape); + if (buffer_arg->data_alignment > 0) { + os() << ", " << buffer_arg->data_alignment << "/*align*/"; + } + os() << ")"; +} + +void CodeGenC::Visit(const ir::intrinsics::GetAddr *op) { + if (op->data.as_buffer()) { + os() << "&" << op->data.as_buffer()->name; + } else if (op->data.as_var()) { + os() << "&" << op->data.as_var()->name; + } else { + os() << "&("; + Print(op->data); + os() << ")"; + } +} + +void CodeGenC::Visit(const ir::intrinsics::ArgsConstruct *op) { + os() << runtime::intrinsic::args_construct_repr << "("; + os() << op->var->name << ", "; + os() << op->args.size() << ", "; + for (int i = 0; i < op->args.size() - 1; i++) { + Print(op->args[i]); + os() << ", "; + } + if (!op->args.empty()) { + Print(op->args.back()); + } + os() << ")"; +} + +void CodeGenC::Visit(const ir::intrinsics::BuiltinIntrin *op) { + os() << op->name << "("; + if (!op->args.empty()) { + for (int i = 0; i < op->args.size() - 1; i++) { + Print(op->args[i]); + os() << ", "; + } + Print(op->args.back()); + } + os() << ")"; +} + +std::string ReadWholeFile(const std::string &path) { + CHECK(!path.empty()); + std::ifstream file(path); + CHECK(file.is_open()) << "Failed to open file: " << path; + std::stringstream ss; + ss << file.rdbuf(); + return ss.str(); +} + +void CodeGenC::PrintBuiltinCodes() { + CHECK(!FLAGS_cinn_x86_builtin_code_root.empty()) << "The flag cinn_x86_builtin_code_root should be set first"; + + const std::string x86_code_file = "_x86_builtin_source.cc"; + + auto source = ReadWholeFile(FLAGS_cinn_x86_builtin_code_root + "/" + x86_code_file); + + os() << source << "\n"; +} + +namespace detail { + +Expr StridedRampBase(Expr e, int stride) { + auto *ramp_n = e.As(); + if (ramp_n) { + auto *iv = ramp_n->stride.As(); + if (iv && iv->value == stride) return ramp_n->base; + } + return Expr(); +} + +} // namespace detail + +} // namespace backends + +} // namespace cinn diff --git a/paddle/cinn/backends/codegen_c.h b/paddle/cinn/backends/codegen_c.h new file mode 100755 index 0000000000000..42458d549bed3 --- /dev/null +++ 
b/paddle/cinn/backends/codegen_c.h @@ -0,0 +1,127 @@ +// Copyright (c) 2021 CINN Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include + +#include +#include + +#include "cinn/common/common.h" +#include "cinn/ir/intrinsic_ops.h" +#include "cinn/ir/ir.h" +#include "cinn/ir/ir_printer.h" +#include "cinn/ir/lowered_func.h" +#include "cinn/ir/module.h" +#include "cinn/lang/packed_func.h" +#include "cinn/runtime/cinn_runtime.h" + +namespace cinn { + +namespace ir { +class Module; +} // namespace ir + +namespace backends { + +//! keyword of __restrict__. +extern const char* kCKeywordRestrict; + +class CodeGenC : public ir::IrPrinter { + public: + enum class OutputKind { + CHeader, //! output the C header file. + CImpl, //! output the C implementation file. + }; + + explicit CodeGenC(Target target); + + void Compile(const ir::Module& module, const Outputs& outputs); + + virtual std::string Compile(const ir::Module& module, OutputKind output_kind); + + //! Disable inline the builtin codes(too large) for simpler string comparison. + void SetInlineBuiltinCodes(bool x = true) { inline_builtin_codes_ = x; } + + protected: + std::string Compile(const ir::LoweredFunc& function); + std::string Compile(const ir::Buffer& buffer); + + void GenerateHeaderFile(const ir::Module& module); + + std::string GetTypeName(Type type); + + std::string GetTypeRepr(Type type); + //! type cast, print like "int(x)" + // @{ + void PrintCastExpr(const Type& type, Expr e); + void PrintCastExpr(const std::string& type, Expr e); + // @} + + void PrintFunctionDeclaration(const ir::_LoweredFunc_* op) { + os() << "void " << op->name << "("; + os() << "void* _args, int32_t num_args"; + os() << ")"; + } + + void PrintShape(const std::vector& shape, char leftb = '{', char rightb = '}'); + + virtual void PrintIncludes(); + void PrintBuiltinCodes(); + void PrintFileGuardOpen(const std::string& module_name); + void PrintFileGuardClose(const std::string& module_name); + + //! Create the buffers in global scope(just creation without allocating them). + void PrintBufferCreation(const std::vector& buffers); + void PrintBufferDestroy(const std::vector& buffers); + void PrintRuntimeType(const cinn_type_t& type); + + //! Print different kinds of Calls. 
+ // @{ + void PrintCallArgs(const ir::Call* call); + void PrintCall_buffer_malloc(const ir::Call* op); + void PrintCall_cinn_pod_value_to_(const ir::Call* op); + void PrintCall_get_address(const ir::Call* op); + void PrintCall_pod_values_to_array(const ir::Call* op); + // @} + +#define __DEFINE_VISIT(op__) void Visit(const ir::op__* op) override; + NODETY_FORALL(__DEFINE_VISIT) +#undef __DEFINE_VISIT + +#define __DEFINE_VISIT(op__) void Visit(const ir::intrinsics::op__* op) override; + INTRINSIC_KIND_FOR_EACH(__DEFINE_VISIT) +#undef __DEFINE_VISIT + + void PrintFuncArg(const ir::Argument& arg); + + void PrintStackVecType(Type type, int lanes); + + friend class ExternFunctionEmitter; + + protected: + Target target_; + std::stringstream ss_; + bool inline_builtin_codes_{true}; +}; + +namespace detail { + +Expr StridedRampBase(Expr e, int stride); + +} // namespace detail + +} // namespace backends +} // namespace cinn diff --git a/paddle/cinn/backends/codegen_c_test.cc b/paddle/cinn/backends/codegen_c_test.cc new file mode 100755 index 0000000000000..3a95774c2f53f --- /dev/null +++ b/paddle/cinn/backends/codegen_c_test.cc @@ -0,0 +1,436 @@ +// Copyright (c) 2021 CINN Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
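+
+// The tests below lower small Compute() definitions with lang::Lower and
+// compare the generated C against golden strings (the R"ROC(...)ROC" blocks).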
+ +#include "cinn/backends/codegen_c.h" + +#include + +#include +#include + +#include "cinn/cinn.h" +#include "cinn/ir/ir.h" +#include "cinn/ir/module.h" +#include "cinn/lang/builtin.h" +#include "cinn/lang/compute.h" +#include "cinn/lang/lower.h" +#include "cinn/lang/placeholder.h" +#include "cinn/optim/ir_simplify.h" +#include "cinn/runtime/cpu/use_extern_funcs.h" + +namespace cinn { +namespace backends { + +using ir::Module; +using lang::Compute; +using lang::Lower; +using lang::Placeholder; +using utils::StringFormat; +using utils::Trim; + +std::tuple CreateTensor1() { + Expr M(100); + Expr N(20); + Placeholder A("A", {M, N}); + Placeholder B("B", {M, N}); + + lang::Buffer C_buf(Float(32)); + auto C = Compute( + {M, N}, [&](Var i, Var j) { return A(i, j) + B(i, j); }, "C"); + C->Bind(C_buf); + return std::make_tuple(A, B, C, C_buf); +} + +TEST(CodeGenC, module) { + ir::Tensor A, B, C; + lang::Buffer C_buf(Float(32)); + std::tie(A, B, C, C_buf) = CreateTensor1(); + + LOG(INFO) << "C.body: " << C->get_compute_op()->body.front(); + + Target target; + target.arch = Target::Arch ::X86; + target.bits = Target::Bit ::k32; + target.os = Target::OS ::Linux; + Module::Builder builder("module1", target); + + auto stages = CreateStages({A, B, C}); + auto func = Lower("add1", stages, {A, B, C}); + + builder.AddFunction(func); + + { + CodeGenC codegen(target); + codegen.SetInlineBuiltinCodes(false); + auto out = codegen.Compile(builder.Build(), CodeGenC::OutputKind::CImpl); + std::cout << "codegen C:" << std::endl << out << std::endl; + + std::string target_str = R"ROC( +#include +#include + +void add1(void* _args, int32_t num_args) +{ + const cinn_buffer_t* _A = cinn_pod_value_to_buffer_p(&(((cinn_pod_value_t*)(_args))[0])); + const cinn_buffer_t* _B = cinn_pod_value_to_buffer_p(&(((cinn_pod_value_t*)(_args))[1])); + cinn_buffer_t* _C = cinn_pod_value_to_buffer_p(&(((cinn_pod_value_t*)(_args))[2])); + cinn_buffer_malloc((void*)(0), _C); + const float* A = ((const float*)(_A->memory)); + const float* B = ((const float*)(_B->memory)); + float* C = ((float*)(_C->memory)); + for (int32_t i = 0; i < 100; i += 1) { + for (int32_t j = 0; j < 20; j += 1) { + C[((20 * i) + j)] = (A[((20 * i) + j)] + B[((20 * i) + j)]); + }; + }; + cinn_buffer_free((void*)(0), _C); +} +)ROC"; + EXPECT_EQ(utils::Trim(target_str), utils::Trim(out)); + } + + { + CodeGenC compiler(target); + auto out = compiler.Compile(builder.Build(), CodeGenC::OutputKind::CHeader); + std::cout << "header:\n" << out << std::endl; + auto target_str = R"ROC( +#ifndef _MODULE1_CINN_H_ +#define _MODULE1_CINN_H_ + +#include +#include + +void add1(void* _args, int32_t num_args); + + +#endif // _MODULE1_CINN_H_ +)ROC"; + + EXPECT_EQ(utils::Trim(out), utils::Trim(target_str)); + } + + { + CodeGenC compiler(target); + compiler.SetInlineBuiltinCodes(false); + Outputs outputs; + outputs = outputs.c_header("./generated_module1.h").c_source("./_generated_module1.cc"); + compiler.Compile(builder.Build(), outputs); + } +} + +TEST(CodeGenC, matmul) { + using namespace ir; // NOLINT + Context::Global().ResetNameId(); + + Placeholder A("A", {Expr(100), Expr(20)}); + Placeholder B("B", {Expr(20), Expr(50)}); + + Target target{}; + + Module::Builder builder("module1", target); + + // C = A * B + Var k(20, "k0"); + + Tensor C = Compute( + {Expr(100), Expr(50)}, [&](Var i, Var j) { return lang::ReduceSum(A(i, k) * B(k, j), {k}); }, "C"); + + auto stages = CreateStages({A, B, C}); + + // Code gen + auto func = Lower("matmul", stages, {A, B, C}); + 
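+ // Lower() turns the ReduceSum into an explicit k0 loop plus a separate
+ // C__reduce_init zero-fill, which is exactly what the golden string below checks.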
builder.AddFunction(func); + builder.AddBuffer(C->buffer); + + { // main + std::vector returns({lang::ReturnType{Float(32), C->shape, C->name}}); + + auto tensors = lang::CallLowered("matmul", {A, B}, returns); + + auto C = tensors[0]; + C->WithBuffer(); + + LOG(INFO) << "C.body: " << C->body(); + + auto stages = CreateStages({C}); + + auto f = Lower("main", stages, {A, B, C}, {}); + std::cout << "f\n" << Expr(f) << std::endl; + builder.AddFunction(f); + } + + CodeGenC codegen(target); + codegen.SetInlineBuiltinCodes(false); + auto out = codegen.Compile(builder.Build(), CodeGenC::OutputKind::CImpl); + std::cout << "codegen C:" << std::endl << out << std::endl; + + auto tgt = R"ROC( +#include +#include + +void matmul(void* _args, int32_t num_args) +{ + const cinn_buffer_t* _A = cinn_pod_value_to_buffer_p(&(((cinn_pod_value_t*)(_args))[0])); + const cinn_buffer_t* _B = cinn_pod_value_to_buffer_p(&(((cinn_pod_value_t*)(_args))[1])); + cinn_buffer_t* _C = cinn_pod_value_to_buffer_p(&(((cinn_pod_value_t*)(_args))[2])); + cinn_buffer_malloc((void*)(0), _C); + const float* A = ((const float*)(_A->memory)); + const float* B = ((const float*)(_B->memory)); + float* C = ((float*)(_C->memory)); + float* C__reduce_init = ((float*)(_C->memory)); + for (int32_t i = 0; i < 100; i += 1) { + for (int32_t j = 0; j < 50; j += 1) { + C__reduce_init[((50 * i) + j)] = 0.00000000f; + for (int32_t k0 = 0; k0 < 20; k0 += 1) { + C[((50 * i) + j)] = (C[((50 * i) + j)] + (A[((20 * i) + k0)] * B[((50 * k0) + j)])); + }; + }; + }; + cinn_buffer_free((void*)(0), _C); +} + +void main(void* _args, int32_t num_args) +{ + const cinn_buffer_t* _A = cinn_pod_value_to_buffer_p(&(((cinn_pod_value_t*)(_args))[0])); + const cinn_buffer_t* _B = cinn_pod_value_to_buffer_p(&(((cinn_pod_value_t*)(_args))[1])); + cinn_buffer_t* _C = cinn_pod_value_to_buffer_p(&(((cinn_pod_value_t*)(_args))[2])); + cinn_buffer_malloc((void*)(0), _C); + const float* A = ((const float*)(_A->memory)); + const float* B = ((const float*)(_B->memory)); + float* C = ((float*)(_C->memory)); + cinn_pod_value_t _pod_val_; + buffer_p_to_cinn_pod_value(_A, &_pod_val_); + cinn_pod_value_t _pod_val__0; + buffer_p_to_cinn_pod_value(_B, &_pod_val__0); + cinn_pod_value_t _pod_val__1; + buffer_p_to_cinn_pod_value(_C, &_pod_val__1); + cinn_pod_value_t _pod_arr[3]; + cinn_args_construct(_pod_arr, 3, &_pod_val_, &_pod_val__0, &_pod_val__1); + matmul(_pod_arr, 3); + cinn_buffer_free((void*)(0), _C); +} +)ROC"; + + ASSERT_EQ(Trim(tgt), Trim(out)); +} + +// This matches output of competitor. 
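+// The schedule below tiles i and j by bn = 32, splits k0 by 4, and reorders the
+// axes to (i_outer, j_outer, i_inner, j_inner, k0_outer, k0_inner); the golden
+// string is the direct lowering of that schedule, with fma() in the inner loop.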
+TEST(CodeGenC, matmul_tile) { + using namespace ir; // NOLINT + Expr M(100); + Expr K(200); + Expr N(500); + Expr bn(32); + Placeholder A("A", {M, K}); + Placeholder B("B", {K, N}); + + // C = A * B + Var k(K.as_int32(), "k0"); + + Tensor C_init = Compute( + {M, N}, [&](Var i, Var j) { return Expr(0.f); }, "C_init"); + + Tensor C = Compute( + {M, N}, [&](Var i, Var j) { return lang::ReduceSum(A(i, k) * B(k, j), {k}); }, "C"); + + auto stages = CreateStages({C, C_init}); + stages[C]->ShareBufferWith(stages[C_init]); + + { + auto _i_outer_i_inner_j_outer_j_inner_ = stages[C_init]->Tile(0, 1, bn.as_int32(), bn.as_int32()); // NOLINT + auto &i_outer = std::get<0>(_i_outer_i_inner_j_outer_j_inner_); + auto &i_inner = std::get<1>(_i_outer_i_inner_j_outer_j_inner_); + auto &j_outer = std::get<2>(_i_outer_i_inner_j_outer_j_inner_); + auto &j_inner = std::get<3>(_i_outer_i_inner_j_outer_j_inner_); + stages[C_init]->Reorder({i_outer, j_outer, i_inner, j_inner}); + } + + { + auto _i_outer_i_inner_j_outer_j_inner_ = stages[C]->Tile(0, 1, bn.as_int32(), bn.as_int32()); // NOLINT + auto &i_outer = std::get<0>(_i_outer_i_inner_j_outer_j_inner_); + auto &i_inner = std::get<1>(_i_outer_i_inner_j_outer_j_inner_); + auto &j_outer = std::get<2>(_i_outer_i_inner_j_outer_j_inner_); + auto &j_inner = std::get<3>(_i_outer_i_inner_j_outer_j_inner_); + auto _k_outer_k_inner_ = stages[C]->Split(poly::Iterator("k0"), 4); // NOLINT + auto &k_outer = std::get<0>(_k_outer_k_inner_); + auto &k_inner = std::get<1>(_k_outer_k_inner_); + stages[C]->Reorder({i_outer, j_outer, i_inner, j_inner, k_outer, k_inner}); + } + + stages[C_init]->ComputeAtSchedule(stages[C], 3, poly::Stage::kComputeAtBefore); + + // Code gen + auto func = Lower("matmul", stages, {A, B, C}); + + Target target = common::DefaultHostTarget(); + + Module::Builder builder("module1", target); + builder.AddFunction(func); + builder.AddBuffer(C_init->buffer); + + CodeGenC codegen(target); + codegen.SetInlineBuiltinCodes(false); + auto out = codegen.Compile(builder.Build(), CodeGenC::OutputKind::CImpl); + std::cout << "codegen C:" << std::endl << out << std::endl; + + auto target_out = R"ROC( +#include +#include + +void matmul(void* _args, int32_t num_args) +{ + const cinn_buffer_t* _A = cinn_pod_value_to_buffer_p(&(((cinn_pod_value_t*)(_args))[0])); + const cinn_buffer_t* _B = cinn_pod_value_to_buffer_p(&(((cinn_pod_value_t*)(_args))[1])); + cinn_buffer_t* _C = cinn_pod_value_to_buffer_p(&(((cinn_pod_value_t*)(_args))[2])); + cinn_buffer_malloc((void*)(0), _C); + const float* A = ((const float*)(_A->memory)); + const float* B = ((const float*)(_B->memory)); + float* C = ((float*)(_C->memory)); + float* C__reduce_init = ((float*)(_C->memory)); + float* C_init = ((float*)(_C->memory)); + for (int32_t i_outer = 0; i_outer < 4; i_outer += 1) { + for (int32_t j_outer = 0; j_outer < 16; j_outer += 1) { + for (int32_t i_inner = 0; i_inner < cinn_min(32, (100 + (-32 * i_outer))); i_inner += 1) { + for (int32_t j_inner = 0; j_inner < cinn_min(32, (500 + (-32 * j_outer))); j_inner += 1) { + C__reduce_init[((500 * i_inner) + ((16000 * i_outer) + ((32 * j_outer) + j_inner)))] = 0.00000000f; + C_init[((500 * i_inner) + ((16000 * i_outer) + ((32 * j_outer) + j_inner)))] = 0.00000000f; + for (int32_t k0_outer = 0; k0_outer < 50; k0_outer += 1) { + for (int32_t k0_inner = 0; k0_inner < 4; k0_inner += 1) { + C[((500 * i_inner) + ((16000 * i_outer) + ((32 * j_outer) + j_inner)))] = fma(A[((200 * i_inner) + ((6400 * i_outer) + ((4 * k0_outer) + k0_inner)))], B[((32 * j_outer) + 
((500 * k0_inner) + ((2000 * k0_outer) + j_inner)))], C[((500 * i_inner) + ((16000 * i_outer) + ((32 * j_outer) + j_inner)))]); + }; + }; + }; + }; + }; + }; + cinn_buffer_free((void*)(0), _C); +} +)ROC"; + + ASSERT_EQ(Trim(target_out), Trim(out)); +} + +TEST(CodeGenC, matmul_packed) { + Expr M(100); + Expr K(200); + Expr N(500); + Expr bn(32); + Placeholder A("A", {M, K}); + Placeholder B("B", {K, N}); + + // TODO(Superjomn) Make sure the domain works. + Var k(K.as_int32(), "k0"); + auto packedB = Compute( + {N / bn, K, bn}, [&](Expr x, Expr y, Expr z) { return B(y, x * bn + z); }, "PackedB"); + auto C = Compute( + {M, N}, [&](Expr i, Expr j) { return ReduceSum(A(i, k) * packedB(j / bn, k, j % bn), {k}); }, "C"); + + auto stages = CreateStages({packedB, C}); + + { + auto _i_outer_i_inner_j_outer_j_inner_ = stages[C]->Tile(0, 1, bn.as_int32(), bn.as_int32()); + auto &i_outer = std::get<0>(_i_outer_i_inner_j_outer_j_inner_); + auto &i_inner = std::get<1>(_i_outer_i_inner_j_outer_j_inner_); + auto &j_outer = std::get<2>(_i_outer_i_inner_j_outer_j_inner_); + auto &j_inner = std::get<3>(_i_outer_i_inner_j_outer_j_inner_); + auto _k_outer_k_inner_ = stages[C]->Split(poly::Iterator("k0"), 4); + auto &k_outer = std::get<0>(_k_outer_k_inner_); + auto &k_inner = std::get<1>(_k_outer_k_inner_); + stages[C]->Reorder({i_outer, j_outer, i_inner, j_inner, k_outer, k_inner}); + } + + // Code gen + auto func = Lower("matmul_with_packing", stages, {A, B, packedB, C}); + + Target target = common::DefaultHostTarget(); + + Module::Builder builder("module1", target); + builder.AddFunction(func); + builder.AddBuffer(C->buffer); + builder.AddBuffer(packedB->buffer); + + CodeGenC codegen(target); + codegen.SetInlineBuiltinCodes(false); + auto out = codegen.Compile(builder.Build(), CodeGenC::OutputKind::CImpl); + std::cout << "codegen C:" << std::endl << out << std::endl; + + auto target_out = R"ROC( +#include +#include + +void matmul_with_packing(void* _args, int32_t num_args) +{ + const cinn_buffer_t* _A = cinn_pod_value_to_buffer_p(&(((cinn_pod_value_t*)(_args))[0])); + const cinn_buffer_t* _B = cinn_pod_value_to_buffer_p(&(((cinn_pod_value_t*)(_args))[1])); + cinn_buffer_t* _PackedB = cinn_pod_value_to_buffer_p(&(((cinn_pod_value_t*)(_args))[2])); + cinn_buffer_t* _C = cinn_pod_value_to_buffer_p(&(((cinn_pod_value_t*)(_args))[3])); + cinn_buffer_malloc((void*)(0), _PackedB); + cinn_buffer_malloc((void*)(0), _C); + const float* A = ((const float*)(_A->memory)); + const float* B = ((const float*)(_B->memory)); + float* C = ((float*)(_C->memory)); + float* C__reduce_init = ((float*)(_C->memory)); + float* PackedB = ((float*)(_PackedB->memory)); + for (int32_t i = 0; i < 15; i += 1) { + for (int32_t j = 0; j < 200; j += 1) { + for (int32_t k = 0; k < 32; k += 1) { + PackedB[((6400 * i) + ((32 * j) + k))] = B[((32 * i) + ((500 * j) + k))]; + }; + }; + }; + for (int32_t i_outer = 0; i_outer < 4; i_outer += 1) { + for (int32_t j_outer = 0; j_outer < 16; j_outer += 1) { + for (int32_t i_inner = 0; i_inner < cinn_min(32, (100 + (-32 * i_outer))); i_inner += 1) { + for (int32_t j_inner = 0; j_inner < cinn_min(32, (500 + (-32 * j_outer))); j_inner += 1) { + C__reduce_init[((500 * i_inner) + ((16000 * i_outer) + ((32 * j_outer) + j_inner)))] = 0; + for (int32_t k0_outer = 0; k0_outer < 50; k0_outer += 1) { + for (int32_t k0_inner = 0; k0_inner < 4; k0_inner += 1) { + C[((500 * i_inner) + ((16000 * i_outer) + ((32 * j_outer) + j_inner)))] = fma(A[((200 * i_inner) + ((6400 * i_outer) + ((4 * k0_outer) + k0_inner)))], 
PackedB[((6400 * (j_inner / 32)) + ((j_inner & 31) + ((6400 * j_outer) + ((32 * k0_inner) + (128 * k0_outer)))))], C[((500 * i_inner) + ((16000 * i_outer) + ((32 * j_outer) + j_inner)))]); + }; + }; + }; + }; + }; + }; + cinn_buffer_free((void*)(0), _PackedB); + cinn_buffer_free((void*)(0), _C); +} +)ROC"; + // ToDo @haoze @wangyue Check Codegen + // ASSERT_EQ(utils::Trim(target_out), utils::Trim(out)); +} + +TEST(CodeGenC, call_extern) { + Expr M(100); + + Placeholder x("x", {M}); + + ir::Tensor y = Compute( + {M}, [=](Var i) -> Expr { return lang::CallExtern("tanh", {x(i)}); }, "y"); + + auto stages = CreateStages({y}); + + auto yexpr = Lower("yy", stages, {y}); + + Module::Builder builder("module0", common::DefaultHostTarget()); + builder.AddFunction(yexpr); + + CodeGenC codegen(common::DefaultHostTarget()); + codegen.SetInlineBuiltinCodes(false); + auto out = codegen.Compile(builder.Build(), CodeGenC::OutputKind::CImpl); + std::cout << "codegen C:" << std::endl << out << std::endl; +} + +} // namespace backends +} // namespace cinn diff --git a/paddle/cinn/backends/codegen_c_x86.cc b/paddle/cinn/backends/codegen_c_x86.cc new file mode 100644 index 0000000000000..737566dc2c651 --- /dev/null +++ b/paddle/cinn/backends/codegen_c_x86.cc @@ -0,0 +1,153 @@ +// Copyright (c) 2021 CINN Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "cinn/backends/codegen_c_x86.h" + +namespace cinn { +namespace backends { + +void CodeGenCX86::Visit(const ir::Add *op) { VisitBinaryOp(op, op->a(), op->b(), "add"); } +void CodeGenCX86::Visit(const ir::Sub *op) { VisitBinaryOp(op, op->a(), op->b(), "sub"); } +void CodeGenCX86::Visit(const ir::Mul *op) { VisitBinaryOp(op, op->a(), op->b(), "mul"); } +void CodeGenCX86::Visit(const ir::Div *op) { VisitBinaryOp(op, op->a(), op->b(), "div"); } + +void CodeGenCX86::Visit(const ir::Load *op) { + Expr dense_strided_ramp = detail::StridedRampBase(op->index(), 1); + if (dense_strided_ramp.defined()) { // Loading a continuous Ramp address. 
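+ // e.g. a float32 load whose index is Ramp(base, 1, 16) is 16 x 32 = 512 bits
+ // wide and lowers to "cinn_avx512_load(A + base)" via PrintAbsAddr
+ // (the tensor name "A" is illustrative).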
+ CHECK(op->type().is_vector()); + + int bits = op->type().bits() * op->type().lanes(); + if (SupportsAVX512() && bits == 512) { + os() << "cinn_avx512_load("; + PrintAbsAddr(op); + os() << ")"; + } else if (SupportsAVX256() && bits == 256) { + os() << "cinn_avx256_load("; + PrintAbsAddr(op); + os() << ")"; + } else { + CodeGenC::Visit(op); + } + } else { + CodeGenC::Visit(op); + } +} + +void CodeGenCX86::Visit(const ir::Broadcast *op) { + CHECK_GT(op->type().lanes(), 1); + int bits = op->type().bits() * op->type().lanes(); + + if (SupportsAVX512() && bits == 512) { + os() << "cinn_avx512_set1("; + PrintCastExpr(op->value.type().ElementOf(), op->value); + os() << ")"; + } else if (SupportsAVX256() && bits == 256) { + os() << "cinn_avx256_set1("; + PrintCastExpr(op->value.type().ElementOf(), op->value); + os() << ")"; + } else { + CodeGenC::Visit(op); + } +} + +void CodeGenCX86::Visit(const ir::Store *op) { + if (op->type().lanes() == 1) { + CodeGenC::Visit(op); + return; + } + + int bits = op->type().bits() * op->type().lanes(); + if (SupportsAVX512() && bits == 512) { + os() << "cinn_avx512_store("; + PrintAbsAddr(op); + os() << ", "; + Print(op->value); + os() << ")"; + } else if (SupportsAVX256() && bits == 256) { + os() << "cinn_avx256_store("; + PrintAbsAddr(op); + os() << ", "; + Print(op->value); + os() << ")"; + } else { + CodeGenC::Visit(op); + } +} + +void CodeGenCX86::PrintVecInputArgument(const Expr *op) { + int bits = op->type().bits() * op->type().lanes(); + auto *broadcast_n = op->As(); + + if (op->type().lanes() == 1 || broadcast_n) { + Expr value = op->type().lanes() == 1 ? *op : broadcast_n->value; + + if (SupportsAVX512()) { + os() << "cinn_avx512_set1("; + Print(value); + os() << ")"; + } else if (SupportsAVX256()) { + os() << "cinn_avx256_set1("; + Print(value); + os() << ")"; + } else { + CINN_NOT_IMPLEMENTED + } + } else { + Print(*op); + } +} + +void CodeGenCX86::Visit(const ir::intrinsics::BuiltinIntrin *op) { + if (op->type().lanes() == 1) { + CodeGenC::Visit(op); + return; + } + int bits = op->type().bits() * op->type().lanes(); + if (SupportsAVX512() && bits == 512) { + os() << "cinn_avx512_" << op->name << "("; + if (!op->args.empty()) { + for (int i = 0; i < op->args.size() - 1; i++) { + PrintVecInputArgument(&op->args[i]); + os() << ", "; + } + Print(op->args.back()); + } + os() << ")"; + } else if (SupportsAVX256() && bits == 256) { + os() << "cinn_avx256_" << op->name << "("; + if (!op->args.empty()) { + for (int i = 0; i < op->args.size() - 1; i++) { + PrintVecInputArgument(&op->args[i]); + os() << ", "; + } + PrintVecInputArgument(&op->args.back()); + } + os() << ")"; + } else if (bits == 128) { + os() << "cinn_avx128_" << op->name << "("; + if (!op->args.empty()) { + for (int i = 0; i < op->args.size() - 1; i++) { + PrintVecInputArgument(&op->args[i]); + os() << ", "; + } + PrintVecInputArgument(&op->args.back()); + } + os() << ")"; + } else { + CodeGenC::Visit(op); + } +} + +} // namespace backends +} // namespace cinn diff --git a/paddle/cinn/backends/codegen_c_x86.h b/paddle/cinn/backends/codegen_c_x86.h new file mode 100644 index 0000000000000..29555df3c5e9a --- /dev/null +++ b/paddle/cinn/backends/codegen_c_x86.h @@ -0,0 +1,131 @@ +// Copyright (c) 2021 CINN Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include
+
+#include "cinn/backends/codegen_c.h"
+#include "cinn/ir/intrinsic_ops.h"
+
+namespace cinn {
+namespace backends {
+
+/**
+ * C code generation with X86 instruction or math library support.
+ */
+class CodeGenCX86 : public CodeGenC {
+ public:
+ //! The X86 CPU supports the following features. We use SSE or AVX instructions to accelerate the basic operations
+ //! when a for-loop is vectorized.
+ enum class Feature : int {
+ None = 0,
+ SSE = 1, //! support the SSE instruction set.
+ AVX256 = 1 << 1, //! support the AVX256 instruction set.
+ AVX512 = 1 << 2, //! support the AVX512 instruction set.
+ BLAS = 1 << 3, //! support the BLAS library.
+ };
+
+ Feature feature{Feature::None};
+
+ /**
+ * constructor.
+ * @param target The device.
+ * @param feature The features it supports.
+ */
+ CodeGenCX86(Target target, Feature feature) : CodeGenC(target), feature(feature) {}
+
+ protected:
+ void Visit(const ir::Add *op) override;
+ void Visit(const ir::Sub *op) override;
+ void Visit(const ir::Mul *op) override;
+ void Visit(const ir::Div *op) override;
+ void Visit(const ir::Mod *op) override { CodeGenC::Visit(op); }
+ void Visit(const ir::EQ *op) override { CodeGenC::Visit(op); }
+ void Visit(const ir::NE *op) override { CodeGenC::Visit(op); }
+ void Visit(const ir::LT *op) override { CodeGenC::Visit(op); }
+ void Visit(const ir::LE *op) override { CodeGenC::Visit(op); }
+ void Visit(const ir::GT *op) override { CodeGenC::Visit(op); }
+ void Visit(const ir::GE *op) override { CodeGenC::Visit(op); }
+ void Visit(const ir::And *op) override { CodeGenC::Visit(op); }
+ void Visit(const ir::Or *op) override { CodeGenC::Visit(op); }
+ void Visit(const ir::Load *op) override;
+ void Visit(const ir::Store *op) override;
+ void Visit(const ir::Broadcast *op) override;
+ void Visit(const ir::intrinsics::BuiltinIntrin *op);
+
+ //! Check the features.
+ // @{
+ bool SupportsSSE() { return static_cast<int>(feature) & static_cast<int>(Feature::SSE); }
+ bool SupportsAVX256() { return static_cast<int>(feature) & static_cast<int>(Feature::AVX256); }
+ bool SupportsAVX512() { return static_cast<int>(feature) & static_cast<int>(Feature::AVX512); }
+ bool SupportsBLAS() { return static_cast<int>(feature) & static_cast<int>(Feature::BLAS); }
+ // @}
+
+ //! Print (and prepare) an argument in vectorized form, for example:
+ // 3. -> set1(3.)
+ // a[i:j] -> load_ps(a+i)
+ void PrintVecInputArgument(const Expr *op);
+ //! The output argument, such as the destination for Load.
+ void PrintVecOutputArgument(const Expr *op);
+
+ template <typename Op>
+ void PrintAbsAddr(const Op *op) {
+ os() << op->tensor.template As<ir::_Tensor_>()->name << " + ";
+
+ auto index = op->index();
+ auto *ramp_n = index.template As<ir::Ramp>();
+ if (ramp_n) {
+ CHECK(!ramp_n->base.template As<ir::Ramp>()) << "base of a Ramp node should not be Ramp type";
+ Print(ramp_n->base);
+ } else {
+ Print(op->index());
+ }
+ }
+
+ template <typename Op>
+ void VisitBinaryOp(const Op *op, Expr a, Expr b, const std::string &op_repr);
+};
+
+template <typename Op>
+void CodeGenCX86::VisitBinaryOp(const Op *op, Expr a, Expr b, const std::string &op_repr) {
+ CHECK_EQ(a.type(), b.type()) << " a is : " << a << ", and b is : " << b << ". 
op_repr is : " << op_repr; + + // scalar. + if (a.type().lanes() == 1) { + CodeGenC::Visit(op); + return; + } + + // TODO(Superjomn) Consider support BLAS. + int bits = a.type().bits() * a.type().lanes(); + if (SupportsAVX512() && bits == 512) { + os() << "cinn_avx512_" << op_repr << "("; + PrintVecInputArgument(&a); + os() << ", "; + PrintVecInputArgument(&b); + os() << ")"; + } else if (SupportsAVX256() && bits == 256) { + os() << "cinn_avx256_" << op_repr << "("; + PrintVecInputArgument(&a); + os() << ", "; + PrintVecInputArgument(&b); + os() << ")"; + } else { + CodeGenC::Visit(op); + } +} + +} // namespace backends +} // namespace cinn diff --git a/paddle/cinn/backends/codegen_c_x86_test.cc b/paddle/cinn/backends/codegen_c_x86_test.cc new file mode 100644 index 0000000000000..b4cb6bf376a51 --- /dev/null +++ b/paddle/cinn/backends/codegen_c_x86_test.cc @@ -0,0 +1,77 @@ +// Copyright (c) 2021 CINN Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "cinn/backends/codegen_c_x86.h" + +#include + +#include "cinn/cinn.h" +#include "cinn/ir/module.h" +#include "cinn/lang/builtin.h" +#include "cinn/lang/compute.h" +#include "cinn/lang/lower.h" +#include "cinn/lang/placeholder.h" +#include "cinn/optim/ir_simplify.h" +#include "cinn/optim/transform_polyfor_to_for.h" +#include "cinn/optim/vectorize_loops.h" + +namespace cinn { +namespace backends { + +TEST(CodeGenCX86, basic) { + // create two forloops, check only one forloop is marked Vectorize. + Context::info_rgt().Clear(); + + using namespace ir; // NOLINT + + const int M = 100; + const int K = 200; + const int N = 500; + const int bn = 32; + + Target target; + target.arch = Target::Arch ::X86; + target.bits = Target::Bit ::k32; + target.os = Target::OS ::Linux; + + Placeholder A("A", {M, N}); + Placeholder B("B", {M, N}); + + // C = A * B + Tensor C = Compute( + {Expr(M), Expr(N)}, [&](Var i, Var j) { return A(i, j) * B(i, j); }, "C"); + + Tensor D = Compute( + {Expr(M), Expr(N)}, [&](Var i, Var j) { return A(i, j) * B(i, j); }, "D"); + + auto stages = CreateStages({C, D}); + // vectorize C, not D + stages[C]->Vectorize(1, 16); + stages[C]->Unroll(1); + + auto func = Lower("matmul", stages, {A, B, C, D}); + + std::cout << "before optim\n" << func->body << std::endl; + + ir::Module::Builder builder("module1", target); + builder.AddFunction(func); + + CodeGenCX86 codegen(target, CodeGenCX86::Feature::AVX512); + codegen.SetInlineBuiltinCodes(false); + auto out = codegen.Compile(builder.Build(), CodeGenC::OutputKind::CImpl); + std::cout << "out:\n" << out; +} + +} // namespace backends +} // namespace cinn diff --git a/paddle/cinn/backends/codegen_cuda_dev.cc b/paddle/cinn/backends/codegen_cuda_dev.cc new file mode 100644 index 0000000000000..21fc8961faeea --- /dev/null +++ b/paddle/cinn/backends/codegen_cuda_dev.cc @@ -0,0 +1,391 @@ +// Copyright (c) 2021 CINN Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "cinn/backends/codegen_cuda_dev.h" + +#include +#include + +#include +#include +#include + +#include "cinn/ir/ir_operators.h" +#include "cinn/ir/ir_verify.h" +#include "cinn/optim/ir_simplify.h" +#include "cinn/optim/remove_nested_block.h" + +namespace cinn { +namespace backends { + +const std::string CodeGenCUDA_Dev::source_header_ = + R"(#include + +#define CINN_WITH_CUDA +#include "bfloat16.h" +#include "float16.h" +using cinn::common::bfloat16; +using cinn::common::float16; +using cinn::common::half4; +using cinn::common::half8; +using cinn::common::float8; + +#include "cinn_cuda_runtime_source.cuh" +)"; + +const std::string &CodeGenCUDA_Dev::GetSourceHeader() { return source_header_; } + +CodeGenCUDA_Dev::CodeGenCUDA_Dev(Target target) : CodeGenC(target) {} + +std::string CodeGenCUDA_Dev::Compile(const ir::Module &module, bool for_nvrtc) { + for_nvrtc_ = for_nvrtc; + auto source = Compile(module, OutputKind::CImpl); + + return source; +} + +void CodeGenCUDA_Dev::Compile(const ir::Module &module, const Outputs &outputs) { + ir::IrVerify(Expr(module)); + + CodeGenC::inline_builtin_codes_ = false; + if (!outputs.c_header_name.empty()) { + auto source = Compile(module, OutputKind::CHeader); + std::ofstream file(outputs.c_header_name); + CHECK(file.is_open()) << "failed to open file " << outputs.c_header_name; + file << source; + file.close(); + LOG(WARNING) << "Output C header to file " << outputs.c_header_name; + } + + if (!outputs.cuda_source_name.empty()) { + auto source = Compile(module, OutputKind::CImpl); + std::ofstream file(outputs.cuda_source_name); + CHECK(file.is_open()) << "failed to open file " << outputs.cuda_source_name; + file << source; + file.close(); + LOG(WARNING) << "Output C source to file " << outputs.cuda_source_name; + } +} + +std::string CodeGenCUDA_Dev::Compile(const ir::LoweredFunc &func) { + Print(Expr(func)); + return ss_.str(); +} + +std::vector CodeGenCUDA_Dev::GenerateBufferAliasExprs(const ir::_LoweredFunc_ *op, + const std::vector &temp_buffers) { + std::set temp_buffer_set(temp_buffers.begin(), temp_buffers.end()); + // prepare temp buffer alias + std::vector buffer_alias; + auto tensors = ir::CollectIRNodes(op->body, [&](const Expr *x) { + return x->as_tensor() && x->as_tensor()->buffer.defined() && temp_buffer_set.count(x->as_tensor()->buffer); + }); + + // unique tensors + std::set unique_tensors; + for (auto &e : tensors) { + unique_tensors.insert(e.as_tensor_ref()); + } + + for (auto &t : unique_tensors) { + auto data_type = t->type(); + auto data_ptr_type = data_type; + data_ptr_type.set_cpp_handle(); + + Var t_var(t->name, data_ptr_type); + Var buf_var(t->buffer->name, data_ptr_type); + buffer_alias.push_back(ir::Let::Make(t_var, buf_var)); + } + + return buffer_alias; +} + +void CodeGenCUDA_Dev::Visit(const ir::_LoweredFunc_ *op) { + // clear names valid within scope when enter a new function + vectorized_tensor_names_.clear(); + os() << "__global__\n"; + + 
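+ // Each lowered function becomes a single __global__ kernel; when
+ // cuda_axis_info is valid, the declaration printed below also carries
+ // __launch_bounds__(product of the three block dims), e.g.
+ // "__global__ void __launch_bounds__(256) fn(const float* __restrict__ A, ...)"
+ // (the count 256 and the signature are illustrative).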
PrintFunctionDeclaration(op);
+ os() << "\n";
+
+ DoIndent();
+
+ std::vector<Expr> new_body;
+
+ auto alloca_temp_buffers = op->PrepareAllocTempBufferExprs();
+ auto temp_buffer_alias = GenerateBufferAliasExprs(op, op->temp_bufs);
+ auto alis_var_exprs = op->CudaAliasVarExprs();
+
+#define APPEND_TO_NEW_BODY(field__) new_body.insert(std::end(new_body), std::begin(field__), std::end(field__));
+ APPEND_TO_NEW_BODY(alloca_temp_buffers)
+ APPEND_TO_NEW_BODY(temp_buffer_alias)
+ APPEND_TO_NEW_BODY(alis_var_exprs)
+
+ new_body.push_back(op->body);
+
+ Expr func_body = ir::Block::Make(new_body);
+
+ optim::RemoveNestedBlock(&func_body);
+ // Make sure that the function's body is wrapped by a block
+ if (!func_body.As<ir::Block>()) {
+ func_body = ir::Block::Make({func_body});
+ }
+ Print(func_body);
+}
+
+void CodeGenCUDA_Dev::Visit(const ir::_Var_ *op) {
+ if (utils::Startswith(op->name, "threadIdx") || utils::Startswith(op->name, "blockIdx")) {
+ os() << "(int)" + op->name;
+ } else {
+ os() << op->name;
+ }
+}
+
+void CodeGenCUDA_Dev::Visit(const ir::Alloc *op) {
+ CHECK(op->destination.as_buffer());
+ PrintTempBufferCreation(op->destination.as_buffer_ref());
+}
+
+void CodeGenCUDA_Dev::Visit(const ir::Min *op) {
+ os() << "min(";
+ Print(op->a());
+ os() << ", ";
+ Print(op->b());
+ os() << ")";
+}
+
+void CodeGenCUDA_Dev::Visit(const ir::Max *op) {
+ os() << "max(";
+ Print(op->a());
+ os() << ", ";
+ Print(op->b());
+ os() << ")";
+}
+
+void CodeGenCUDA_Dev::PrintFunctionDeclaration(const ir::_LoweredFunc_ *op) {
+ os() << "void ";
+ if (op->cuda_axis_info.valid()) {
+ int thread_num = 1;
+ for (int i = 0; i < 3; i++) {
+ thread_num *= op->cuda_axis_info.block_dim(i);
+ }
+ os() << "__launch_bounds__(" << thread_num << ") ";
+ }
+
+ os() << op->name << "(";
+ for (int i = 0; i < op->args.size() - 1; i++) {
+ auto &arg = op->args[i];
+ PrintFuncArg(arg);
+ os() << ", ";
+ }
+ if (!op->args.empty()) {
+ PrintFuncArg(op->args.back());
+ }
+ os() << ")";
+}
+
+void CodeGenCUDA_Dev::PrintFuncArg(const ir::Argument &arg) {
+ if (arg.is_buffer()) {
+ // In a CUDA kernel only primitive types are supported, so a buffer argument is passed as a plain T* pointer.
+ if (arg.is_input()) os() << "const ";
+ os() << GetTypeRepr(arg.buffer_arg()->dtype);
+ os() << "* ";
+ os() << kCKeywordRestrict << " ";
+ os() << ir::BufferGetTensorName(arg.buffer_arg().As<ir::_Buffer_>());
+ } else if (arg.is_var()) {
+ if (arg.var_arg()->type().is_cpp_handle()) {
+ os() << kCKeywordRestrict;
+ }
+ os() << GetTypeRepr(arg.type()) << " ";
+ os() << arg.name();
+ } else {
+ CINN_NOT_IMPLEMENTED
+ }
+}
+
+void CodeGenCUDA_Dev::PrintBuiltinCodes() {}
+
+std::string CodeGenCUDA_Dev::Compile(const ir::Module &module, CodeGenC::OutputKind output_kind) {
+ if (output_kind == OutputKind::CHeader) {
+ GenerateHeaderFile(module);
+ } else if (output_kind == OutputKind::CImpl) {
+ PrintIncludes();
+
+ if (for_nvrtc_) {
+ os() << "\nextern \"C\" {\n\n";
+ }
+
+ PrintBuiltinCodes();
+
+ for (auto &func : module.functions()) {
+ Compile(func);
+ }
+ } else {
+ LOG(FATAL) << "Not supported OutputKind";
+ }
+
+ if (for_nvrtc_) {
+ os() << "\n\n}";
+ }
+
+ return ss_.str();
+}
+
+void CodeGenCUDA_Dev::PrintIncludes() { os() << GetSourceHeader(); }
+
+void CodeGenCUDA_Dev::PrintTempBufferCreation(const ir::Buffer &buffer) {
+ CHECK_NE(buffer->type(), Void());
+ auto print_gpu_memory = [&](const std::string &mark) {
+ os() << mark << GetTypeRepr(buffer->dtype) << " " << buffer->name << " ";
+
+ os() << "[ ";
+ Expr buffer_size(1);
+ for (int i = 0; i < buffer->shape.size(); i++) {
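+ // Fold all extents into one flat element count; optim::Simplify below
+ // collapses the product into a single constant used as the array extent.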
buffer_size = buffer_size * buffer->shape[i]; + } + optim::Simplify(&buffer_size); + Print(buffer_size); + os() << " ]"; + }; + switch (buffer->memory_type) { + case ir::MemoryType::GPUShared: + print_gpu_memory("__shared__ "); + break; + + case ir::MemoryType::GPULocal: + print_gpu_memory(""); + break; + + default: + LOG(FATAL) << "CUDA device codegen not support memory " << buffer->name << ", type " << buffer->memory_type; + } +} + +void CodeGenCUDA_Dev::Visit(const ir::Call *op) { + os() << op->name + "("; + + if (!op->read_args.empty()) { + for (int i = 0; i < op->read_args.size() - 1; i++) { + auto &arg = op->read_args[i]; + if (arg.as_tensor()) { + os() << arg.as_tensor()->name; + os() << ", "; + } else { + Print(arg); + os() << ", "; + } + } + if (op->read_args.back().as_tensor()) { + os() << op->read_args.back().as_tensor()->name; + } else { + Print(op->read_args.back()); + } + } + + if (!op->write_args.empty()) { + os() << ", "; + for (int i = 0; i < op->write_args.size() - 1; i++) { + auto &arg = op->write_args[i]; + if (arg.as_tensor()) { + os() << arg.as_tensor()->name; + os() << ", "; + } else { + Print(arg); + os() << ", "; + } + } + if (op->write_args.back().as_tensor()) { + os() << op->write_args.back().as_tensor()->name; + } else { + Print(op->write_args.back()); + } + } + + os() << ")"; +} + +void CodeGenCUDA_Dev::Visit(const ir::Let *op) { + CHECK(op->type().valid()); + + // identify vectorized tensors by checking their dtypes are customized_type + // with customized_type::kcuda_builtin_vector_t prefix, and save their names + if (op->type().is_customized() && + utils::Startswith(op->type().customized_type(), common::customized_type::kcuda_builtin_vector_t)) { + os() << GetTypeRepr(op->type()); + if (op->type().is_cpp_handle()) { + os() << " " << kCKeywordRestrict; + } + os() << " "; + Print(op->symbol); + vectorized_tensor_names_.insert(utils::GetStreamCnt(op->symbol)); + // skip "=0" in "half8 temp = 0;" sincethe operator= of half8 may not overloaded. + if (op->body.As() && op->body.As()->value == 0) { + return; + } + os() << " = "; + Print(op->body); + } else { + CodeGenC::Visit(op); + } +} + +bool CodeGenCUDA_Dev::PrintBuiltinVectorAccess(const ir::LoadStoreAddrMnger *op, ir::Expr index_expr, bool is_store) { + static constexpr char index2suffix[8] = {'x', 'y', 'z', 'w', 'v', 'u', 't', 's'}; + + // addr of op should be a place of tensor and the index is simple int number + if (!op->is_addr_tensor() || !index_expr.As()) { + return false; + } + auto *tensor = op->tensor.As(); + CHECK(tensor); + + // identify vectorized tensors by their names + if (!vectorized_tensor_names_.count(tensor->name)) { + return false; + } + + // the index can't exceed the range of cuda built-in vector type + int index = index_expr.As()->value; + if (index < 0 || index >= 8) { + return false; + } + if (is_store && tensor->type().is_cpp_handle()) { + os() << tensor->name << "[" << index << "]"; + } else { + os() << tensor->name << (tensor->type().is_cpp_handle() ? 
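               // pointer handles are dereferenced with "->", values with "."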
"->" : ".") << index2suffix[index]; + } + return true; +} + +void CodeGenCUDA_Dev::Visit(const ir::Load *op) { + // overload this visit function to especially deal with the case when it accesses + // element at a cuda built-in vector, others still resolve to CodeGenC + if (!PrintBuiltinVectorAccess(op, op->index(), false)) { + CodeGenC::Visit(op); + } +} + +void CodeGenCUDA_Dev::Visit(const ir::Store *op) { + // overload this visit function to especially deal with the case when it accesses + // element at a cuda built-in vector, others still resolve to CodeGenC + if (PrintBuiltinVectorAccess(op, op->index(), true)) { + os() << " = "; + Print(op->value); + } else { + CodeGenC::Visit(op); + } +} + +} // namespace backends +} // namespace cinn diff --git a/paddle/cinn/backends/codegen_cuda_dev.h b/paddle/cinn/backends/codegen_cuda_dev.h new file mode 100644 index 0000000000000..ad7e03024553f --- /dev/null +++ b/paddle/cinn/backends/codegen_cuda_dev.h @@ -0,0 +1,110 @@ +// Copyright (c) 2021 CINN Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include +#include + +#include "cinn/backends/codegen_c.h" +#include "cinn/common/common.h" +#include "cinn/ir/ir.h" +#include "cinn/ir/ir_printer.h" +#include "cinn/ir/lowered_func.h" +#include "cinn/ir/module.h" +#include "cinn/lang/packed_func.h" +#include "cinn/runtime/cinn_runtime.h" + +namespace cinn::ir { +class Module; +} // namespace cinn::ir + +namespace cinn { +namespace backends { + +/** + * CUDA device code generator. + * + * It generates the device function, e.g, the function called "myadd" will have a __global__ functon called + * "myadd_kernel", different from codegen_c, the declaration of the "myadd_kernel" function has an expanded argument + * list, which finally similar to `__global__ void myadd(float* __restrict__ A, float* __restrict__ B, int n);` + */ +class CodeGenCUDA_Dev : public CodeGenC { + public: + explicit CodeGenCUDA_Dev(Target target); + + /** + * Compile the \p module to \p outputs. + */ + void Compile(const ir::Module& module, const Outputs& outputs); + + //! Compile on NVRTC. + std::string Compile(const ir::Module& module, bool for_nvrtc = true); + + std::string Compile(const ir::LoweredFunc& func); + + /** + * \brief Print a function argument in CUDA syntax. Currently, just some decoration of __restrict__. + * @param arg the argument. + * @return the representation in CUDA syntax. + * + * We make it a static to make the test easier. 
+ */ + void PrintFuncArg(const ir::Argument& arg); + + std::string Compile(const ir::Module& module, OutputKind output_kind); + + static const std::string& GetSourceHeader(); + + protected: + void Visit(const ir::_Var_* op) override; + void Visit(const ir::_LoweredFunc_* op) override; + void Visit(const ir::Min* op) override; + void Visit(const ir::Max* op) override; + void Visit(const ir::Alloc* op) override; + void Visit(const ir::Call* op) override; + void Visit(const ir::Load* op) override; + void Visit(const ir::Store* op) override; + void Visit(const ir::Let* op) override; + + // Print element access at a cuda built-in vector on a load/store node + bool PrintBuiltinVectorAccess(const ir::LoadStoreAddrMnger* op, ir::Expr index, bool is_store); + + void PrintBuiltinCodes(); + + void PrintIncludes() override; + + void PrintTempBufferCreation(const ir::Buffer& buffer); + + void PrintTempBufferAliasDefinition(const ir::Buffer& buffer); + + std::vector GenerateBufferAliasExprs(const ir::_LoweredFunc_* op, const std::vector& temp_buffers); + + /** + * Print the function declaration, this is different from C, we expand the arguments and get something like + * `__global__ void myadd(float* __restrict__ A, float* __restrict__ B, int n);` + */ + void PrintFunctionDeclaration(const ir::_LoweredFunc_* op); + + private: + Target target_; + bool for_nvrtc_{false}; + // names of vectorized tensors from `Let` statments where dtypes of the tensors + // are customized_type with customized_type::kcuda_builtin_vector_t prefix + std::unordered_set vectorized_tensor_names_; + static const std::string source_header_; +}; + +} // namespace backends +} // namespace cinn diff --git a/paddle/cinn/backends/codegen_cuda_generate_test.cc b/paddle/cinn/backends/codegen_cuda_generate_test.cc new file mode 100644 index 0000000000000..5d4fc35afe663 --- /dev/null +++ b/paddle/cinn/backends/codegen_cuda_generate_test.cc @@ -0,0 +1,68 @@ +// Copyright (c) 2021 CINN Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
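
// Writes CodeGenCUDA_Dev's source header followed by a hand-written sample
// kernel into a .cu file, so the emitted header content can be inspected.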
+ +#include +#include + +#include +#include +#include + +#include "cinn/backends/codegen_cuda_dev.h" +#include "cinn/backends/codegen_cuda_host.h" +#include "cinn/backends/codegen_cuda_util.h" +#include "cinn/backends/extern_func_jit_register.h" +#include "cinn/backends/llvm/execution_engine.h" +#include "cinn/backends/llvm/simple_jit.h" +#include "cinn/cinn.h" +#include "cinn/common/ir_util.h" +#include "cinn/common/test_helper.h" +#include "cinn/hlir/pe/nn.h" +#include "cinn/hlir/pe/schedule.h" +#include "cinn/ir/ir_printer.h" +#include "cinn/ir/ir_schedule.h" +#include "cinn/lang/lower.h" +#include "cinn/optim/ir_simplify.h" +#include "cinn/utils/timer.h" + +namespace cinn { +namespace backends { + +TEST(CUDAFile, Module_output) { + std::string cuda_source_name = "_generated1.cu"; + std::string cuda_source_code = R"ROC( +extern "C" { + +__global__ +void __launch_bounds__(200) elementwise_mul(const float* __restrict__ A, const float* __restrict__ B, float* __restrict__ C) +{ + if (((int)blockIdx.x < 100)) { + if (((int)threadIdx.x < 200)) { + C[((200 * (int)blockIdx.x) + (int)threadIdx.x)] = (A[((200 * (int)blockIdx.x) + (int)threadIdx.x)] * B[((200 * (int)blockIdx.x) + (int)threadIdx.x)]); + }; + }; +} + +} + )ROC"; + std::ofstream file(cuda_source_name); + CHECK(file.is_open()) << "failed to open file " << cuda_source_name; + file << CodeGenCUDA_Dev::GetSourceHeader(); + file << cuda_source_code; + file.close(); + LOG(WARNING) << "Output C source to file " << cuda_source_name; +} + +} // namespace backends +} // namespace cinn diff --git a/paddle/cinn/backends/codegen_cuda_host.cc b/paddle/cinn/backends/codegen_cuda_host.cc new file mode 100644 index 0000000000000..38774b181dbcc --- /dev/null +++ b/paddle/cinn/backends/codegen_cuda_host.cc @@ -0,0 +1,173 @@ +// Copyright (c) 2021 CINN Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
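
// Lowers the host-side launcher of a GPU kernel to LLVM IR: the argument
// types are collected from the kernel's Call node and a direct call to the
// registered runtime launch function is emitted.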
+ +#include "cinn/backends/codegen_cuda_host.h" + +#include +#include +#include + +#include "cinn/backends/codegen_cuda_util.h" +#include "cinn/backends/extern_func_emitter_builtin.h" +#include "cinn/backends/extern_func_jit_register.h" +#include "cinn/backends/llvm/llvm_util.h" +#include "cinn/runtime/intrinsic.h" + +namespace cinn { +namespace backends { + +using cinn::common::bfloat16; +using cinn::common::float16; + +const int kArgsArrayMaxLen = 20; + +llvm::Value* CodeGenCUDA_Host::LowerGPUKernelLauncher(const ir::_LoweredFunc_* func) { + auto body = func->body; + auto* call_ir = body.As(); + CHECK(call_ir); + + // Create the function + // @{ + auto* function_type = GenFunctionTypeFromCinnFunction(func, true); + llvm::Function* function = llvm::Function::Create(function_type, llvm::Function::ExternalLinkage, func->name, m_); + function->setCallingConv(llvm::CallingConv::C); + function->setHasUWTable(); + + std::vector ll_function_args; + std::transform(function->arg_begin(), function->arg_end(), std::back_inserter(ll_function_args), [](auto& arg) { + return std::addressof(arg); + }); + // @} + + llvm::BasicBlock* entry = llvm::BasicBlock::Create( + /*Context=*/b_->getContext(), + /*Name=*/"entry", + /*Parent=*/function, + /*InsertBefore=*/nullptr); + b_->SetInsertPoint(entry); + + auto* kernel_args = ll_function_args[0]; + auto* kernel_args_count = ll_function_args[1]; + llvm::Value* kernel_stream = nullptr; + if (ll_function_args.size() == 3) { + kernel_stream = ll_function_args[2]; + CHECK_EQ(kernel_stream->getType(), ll_void_p_ty()); // void* stream + } + CHECK_EQ(kernel_args->getType(), ll_void_p_ty()); // void* args + CHECK_EQ(kernel_args_count->getType(), ll_int32_ty()); // int32 + + std::unordered_map global_args = { + {KERNEL_ARGS, kernel_args}, {KERNEL_ARGS_NUM, kernel_args_count}, {KERNEL_STREAM, kernel_stream}}; + + auto ret_type = CinnTypeToLLVMType(Void(), m_); + std::vector args_type; + for (auto r_arg : call_ir->read_args) { + if (r_arg.is_var()) { + if (r_arg.as_var()->type().is_cpp_handle() || r_arg.as_var()->type().is_string()) { + args_type.push_back(CinnTypeToLLVMType(type_of(), m_)); + } else if (r_arg.as_var()->type().is_int(32)) { + args_type.push_back(CinnTypeToLLVMType(type_of(), m_)); + } else { + CINN_NOT_IMPLEMENTED; + } + } else { + if (r_arg.type().is_bool()) { + args_type.push_back(CinnTypeToLLVMType(type_of(), m_)); + } else if (r_arg.type().is_uint(8)) { + args_type.push_back(CinnTypeToLLVMType(type_of(), m_)); + } else if (r_arg.type().is_uint(16)) { + args_type.push_back(CinnTypeToLLVMType(type_of(), m_)); + } else if (r_arg.type().is_uint(32)) { + args_type.push_back(CinnTypeToLLVMType(type_of(), m_)); + } else if (r_arg.type().is_uint(64)) { + args_type.push_back(CinnTypeToLLVMType(type_of(), m_)); + } else if (r_arg.type().is_int(8)) { + args_type.push_back(CinnTypeToLLVMType(type_of(), m_)); + } else if (r_arg.type().is_int(16)) { + args_type.push_back(CinnTypeToLLVMType(type_of(), m_)); + } else if (r_arg.type().is_int(32)) { + args_type.push_back(CinnTypeToLLVMType(type_of(), m_)); + } else if (r_arg.type().is_int(64)) { + args_type.push_back(CinnTypeToLLVMType(type_of(), m_)); + } else if (r_arg.type().is_float(32)) { + args_type.push_back(CinnTypeToLLVMType(type_of(), m_)); + } else if (r_arg.type().is_float(64)) { + args_type.push_back(CinnTypeToLLVMType(type_of(), m_)); + } else if (r_arg.type().is_bfloat16()) { + args_type.push_back(CinnTypeToLLVMType(type_of(), m_)); + } else if (r_arg.type().is_float16()) { + 
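        // fp16 scalars keep a dedicated half-precision type in the signature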
args_type.push_back(CinnTypeToLLVMType(type_of(), m_)); + } else { + CINN_NOT_IMPLEMENTED; + } + } + } + auto func_type = llvm::FunctionType::get(ret_type, args_type, false); + auto call_func = m_->getOrInsertFunction(call_ir->name, func_type); + + std::vector call_args; + for (auto& r_arg : call_ir->read_args) { + if (r_arg.is_var()) { + if (r_arg.as_var()->type().is_string()) { + auto kvalue = m_->getOrInsertGlobal(r_arg.as_var()->name + "_ptr_", b_->getInt8PtrTy()); + call_args.push_back(b_->CreateLoad(b_->getInt8PtrTy(), kvalue, r_arg.as_var()->name + "_ptr_load")); + } else if (r_arg.as_var()->type().is_cpp_handle() || r_arg.as_var()->type().is_int(32)) { + CHECK(global_args.count(r_arg.as_var()->name)); + call_args.push_back(global_args[r_arg.as_var()->name]); + } else { + CINN_NOT_IMPLEMENTED; + } + } else { + if (r_arg.type().is_bool()) { + call_args.push_back(b_->getInt1(r_arg.as_bool())); + } else if (r_arg.type().is_int(8)) { + call_args.push_back(b_->getInt8(r_arg.as_int8())); + } else if (r_arg.type().is_int(16)) { + call_args.push_back(b_->getInt16(r_arg.as_int16())); + } else if (r_arg.type().is_int(32)) { + call_args.push_back(b_->getInt32(r_arg.as_int32())); + } else if (r_arg.type().is_int(64)) { + call_args.push_back(b_->getInt64(r_arg.as_int64())); + } else if (r_arg.type().is_uint(8)) { + call_args.push_back(b_->getInt8(r_arg.as_uint8())); + } else if (r_arg.type().is_uint(16)) { + call_args.push_back(b_->getInt16(r_arg.as_uint16())); + } else if (r_arg.type().is_uint(32)) { + call_args.push_back(b_->getInt32(r_arg.as_uint32())); + } else if (r_arg.type().is_uint(64)) { + call_args.push_back(b_->getInt64(r_arg.as_uint64())); + } else if (r_arg.type().is_float(32)) { + call_args.push_back(llvm::ConstantFP::get(b_->getFloatTy(), llvm::APFloat(r_arg.as_float()))); + } else if (r_arg.type().is_float(64)) { + call_args.push_back(llvm::ConstantFP::get(b_->getDoubleTy(), llvm::APFloat(r_arg.as_double()))); + } else if (r_arg.type().is_bfloat16()) { + call_args.push_back( + llvm::ConstantFP::get(b_->getBFloatTy(), llvm::APFloat(static_cast(r_arg.as_bfloat16())))); + } else if (r_arg.type().is_float16()) { + call_args.push_back( + llvm::ConstantFP::get(b_->getHalfTy(), llvm::APFloat(static_cast(r_arg.as_float16())))); + } else { + CINN_NOT_IMPLEMENTED; + } + } + } + + b_->CreateCall(call_func, call_args); + RetVoid(); + + return function; +} + +} // namespace backends +} // namespace cinn diff --git a/paddle/cinn/backends/codegen_cuda_host.h b/paddle/cinn/backends/codegen_cuda_host.h new file mode 100644 index 0000000000000..4f0b858db4144 --- /dev/null +++ b/paddle/cinn/backends/codegen_cuda_host.h @@ -0,0 +1,56 @@ +// Copyright (c) 2021 CINN Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include + +#include +#include +#include +#include + +#include "cinn/backends/llvm/codegen_llvm.h" + +namespace cinn { +namespace backends { + +/** + * CodeGenCUDA takes a CINN Module with host functions and output a LLVM module. 
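 * The device kernels themselves are compiled separately by CodeGenCUDA_Dev;
 * this class lowers only the host-side launcher stubs.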
+ */ +class CodeGenCUDA_Host : public CodeGenLLVM { + public: + explicit CodeGenCUDA_Host(llvm::Module *m, llvm::IRBuilder<> *b, const std::shared_ptr &vars = nullptr) + : CodeGenLLVM(m, b, vars) {} + + using CodeGenLLVM::Visit; + llvm::Value *Visit(const ir::_LoweredFunc_ *func) override { return LowerGPUKernelLauncher(func); } + + private: + /** + * Lower a CUDA kernel launcher. + * + * We launch a CUDA kernel in the following way: + * + * 1. a GPU function (called fn) will compiled to PTX and lower by CUDA driver to a function pointer, which we store + * as a `void*` type global variable [fn_kernel_ptr] in LLVM module. + * 2. when lower the host launcher, we replace the Call of the original kernel [fn] to a Call of + * `cinn_call_cuda_kernel` method which is registered as an external function. + * + */ + llvm::Value *LowerGPUKernelLauncher(const ir::_LoweredFunc_ *func); +}; + +} // namespace backends +} // namespace cinn diff --git a/paddle/cinn/backends/codegen_cuda_util.cc b/paddle/cinn/backends/codegen_cuda_util.cc new file mode 100644 index 0000000000000..ee7174be9f407 --- /dev/null +++ b/paddle/cinn/backends/codegen_cuda_util.cc @@ -0,0 +1,30 @@ +// Copyright (c) 2021 CINN Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "cinn/backends/codegen_cuda_util.h" + +#include "cinn/backends/cuda_util.h" +#include "cinn/ir/ir_mutator.h" + +namespace cinn { +namespace backends { + +std::tuple SplitCudaAndHostModule(ir::Module module) { + detail::CollectHostFunctionVisitor visitor(module->name); + Expr expr(module); + return visitor(&expr); +} + +} // namespace backends +} // namespace cinn diff --git a/paddle/cinn/backends/codegen_cuda_util.h b/paddle/cinn/backends/codegen_cuda_util.h new file mode 100755 index 0000000000000..598feede403ae --- /dev/null +++ b/paddle/cinn/backends/codegen_cuda_util.h @@ -0,0 +1,140 @@ +// Copyright (c) 2021 CINN Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include + +#include +#include +#include + +#include "cinn/cinn.h" +#include "cinn/ir/ir.h" +#include "cinn/ir/ir_mutator.h" +#include "cinn/optim/ir_copy.h" + +namespace cinn { +namespace backends { + +#define KERNEL_ARGS "kernel_args" +#define KERNEL_ARGS_NUM "kernel_args_num" +#define KERNEL_STREAM "kernel_stream" + +/** + * Split a CINN Module into two separate modules, one cantains the host functions, the other contains the device + * kernels. 
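 * For a kernel `fn`, the host module receives a launcher stub still named
 * `fn`, while the device module receives the renamed kernel `fn_kernel`.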
+ * + * This contains some process: + * + * - replace the original kernel function with a Call node and add it to the first module, add a device kernel function + * to the second module. + */ +std::tuple SplitCudaAndHostModule(ir::Module module); + +namespace detail { + +struct CollectHostFunctionVisitor : public ir::IRMutator<> { + explicit CollectHostFunctionVisitor(const std::string& module_name) + : host_module_builder(module_name + "_host", common::DefaultHostTarget()), + device_module_builder(module_name + "_gpu_device", common::DefaultNVGPUTarget()) {} + + std::tuple operator()(Expr* expr) { + ir::IRMutator<>::Visit(expr, expr); + return std::make_tuple(host_module_builder.Build(), device_module_builder.Build()); + } + + private: + void Visit(const ir::_LoweredFunc_* op, Expr* expr) override { + if (op->body.As()) { + host_module_builder.AddFunction(expr->as_lowered_func_ref()); + } else { + if (!op->cuda_axis_info.valid()) { + expr->as_lowered_func_ref()->cuda_axis_info.set_valid(true); + } + auto host_func = CreateHostFunctionGivenDeviceKernel(op); + host_module_builder.AddFunction(host_func.as_lowered_func_ref()); + device_module_builder.AddFunction(CreateDeviceFunctionGivenDeviceKernel(*expr).as_lowered_func_ref()); + } + } + + /** + * Create a wrapper function for a kernel. + * + * For example, we get a kernel function: + * + * \code + * __global__ + * void fn (float* a, float* out) { ... } + * \endcode + * + * A host wrapper function will generate for it + * + * \code + * void fn (cinn_buffer_t* a, cinn_buffer_t* out) { + * Call(fn_kernel); + * } + * \endcode + */ + Expr CreateHostFunctionGivenDeviceKernel(const ir::_LoweredFunc_* func) { + // std::vector args; + // NOTE the suffix `__ptr` makes this argument lower to a pointer in LLVM backend. 
+ // args.push_back(Var("args__ptr", type_of())); + // args.push_back(Var("num_args", type_of())); + ir::Var kernel_ptr(GenDeviceKernelName(func->name), type_of()); + ir::Var kernel_args(KERNEL_ARGS, type_of()); + ir::Var kernel_args_num(KERNEL_ARGS_NUM, type_of()); + ir::Var kernel_stream(KERNEL_STREAM, type_of()); + + auto call_extern_api = ir::Call::Make(Void(), + runtime::intrinsic::call_cuda_kernel, + {kernel_ptr, + kernel_args, + kernel_args_num, + Expr(func->cuda_axis_info.grid_dim(0)), // grid_x + Expr(func->cuda_axis_info.grid_dim(1)), // grid_y + Expr(func->cuda_axis_info.grid_dim(2)), // grid_z + Expr(func->cuda_axis_info.block_dim(0)), // block_x + Expr(func->cuda_axis_info.block_dim(1)), // block_y + Expr(func->cuda_axis_info.block_dim(2)), // block_z + kernel_stream}, + {}, + ir::CallType::Extern, + ir::FunctionRef(), + 0); + std::vector arguments = {ir::Argument(kernel_args, ir::Argument::IO::kOutput), + ir::Argument(kernel_args_num, ir::Argument::IO::kInput), + ir::Argument(kernel_stream, ir::Argument::IO::kOutput)}; + + return ir::_LoweredFunc_::Make(func->name, arguments, call_extern_api, {}); + } + + Expr CreateDeviceFunctionGivenDeviceKernel(Expr expr) { + auto copied = optim::IRCopy(expr); + auto* lowered_func = copied.as_lowered_func(); + lowered_func->name = GenDeviceKernelName(lowered_func->name); + return copied; + } + + inline std::string GenDeviceKernelName(const std::string& fn) { return fn + "_kernel"; } + + private: + ir::Module::Builder host_module_builder; + ir::Module::Builder device_module_builder; +}; + +} // namespace detail + +} // namespace backends +} // namespace cinn diff --git a/paddle/cinn/backends/codegen_debug_test.cc b/paddle/cinn/backends/codegen_debug_test.cc new file mode 100644 index 0000000000000..306238f58fe52 --- /dev/null +++ b/paddle/cinn/backends/codegen_debug_test.cc @@ -0,0 +1,121 @@ +// Copyright (c) 2022 CINN Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include + +#include +#include + +#include "cinn/backends/codegen_cuda_dev.h" +#include "cinn/backends/nvrtc/nvrtc_util.h" +#include "cinn/common/context.h" +#include "cinn/runtime/cuda/cuda_module.h" + +namespace cinn { +namespace backends { + +/** + * This file is not a common test, it is used as a util for developers to + * write source CUDA code to debug whether it runs correctly during runtime + */ +using runtime::cuda::CUDAModule; + +/** + * Utility function to create cuda memory of non-empty shape. + * + * @param shape: a non-empty shape for the created cuda memory + * @param data: the data to initialize the cuda memory. 
Function doesn't + * initailize if it is nullptr + * @return the CUdeviceptr pointing to the created memory + */ +template +CUdeviceptr CreateCudaMemory(const std::vector& shape, const T* data) { + CHECK(!shape.empty()) << "Couldn't create CUDA memory for empty shape"; + CUDA_CALL(cudaDeviceSynchronize()); + + int numel = 1; + for (int s : shape) { + numel = numel * s; + } + + CUdeviceptr cuda_ptr = cuMemAlloc(&cuda_ptr, numel * sizeof(T)); + if (data != nullptr) { + CUDA_CALL(cudaMemcpy(reinterpret_cast(cuda_ptr), data, numel * sizeof(T), cudaMemcpyHostToDevice)); + } + return cuda_ptr; +} + +TEST(CodeGenDebug, RunCudaSourceCode) { + common::Context::Global().ResetNameId(); + + std::string source_code = R"ROC( +extern "C" { + +__global__ +void __launch_bounds__(512) fn_relu_1_kernel(const float* __restrict__ var_1, float* __restrict__ Relu_output) +{ + for (int32_t j_0 = 0; j_0 < 8; j_0 += 1) { + for (int32_t j_1 = 0; j_1 < 1; j_1 += 1) { + for (int32_t j_2 = 0; j_2 < 1; j_2 += 1) { + for (int32_t j_3 = 0; j_3 < 8; j_3 += 1) { + for (int32_t j_4 = 0; j_4 < 1; j_4 += 1) { + for (int32_t k_0 = 0; k_0 < 1; k_0 += 1) { + for (int32_t k_1 = 0; k_1 < 7; k_1 += 1) { + for (int32_t k_2 = 0; k_2 < 4; k_2 += 1) { + for (int32_t k_3 = 0; k_3 < 4; k_3 += 1) { + for (int32_t k_4 = 0; k_4 < 1; k_4 += 1) { + for (int32_t a_0 = 0; a_0 < 16; a_0 += 1) { + for (int32_t a_1 = 0; a_1 < 1; a_1 += 1) { + for (int32_t a_2 = 0; a_2 < 1; a_2 += 1) { + for (int32_t a_3 = 0; a_3 < 1; a_3 += 1) { + for (int32_t a_4 = 0; a_4 < 7; a_4 += 1) { + Relu_output[((7 * a_0) + ((7 * a_1) + ((7 * a_2) + ((7 * a_3) + ((100352 * j_0) + ((100352 * j_1) + ((100352 * j_2) + ((12544 * j_3) + ((12544 * j_4) + ((12544 * k_0) + ((1792 * k_1) + ((448 * k_2) + ((112 * k_3) + ((112 * k_4) + a_4))))))))))))))] = max(var_1[((7 * a_0) + ((7 * a_1) + ((7 * a_2) + ((7 * a_3) + ((100352 * j_0) + ((100352 * j_1) + ((100352 * j_2) + ((12544 * j_3) + ((12544 * j_4) + ((12544 * k_0) + ((1792 * k_1) + ((448 * k_2) + ((112 * k_3) + ((112 * k_4) + a_4))))))))))))))], 0.00000000f); + }; + }; + }; + }; + }; + }; + }; + }; + }; + }; + }; + }; + }; + }; + }; +} + +} +)ROC"; + + backends::nvrtc::Compiler compiler; + + std::string ptx = compiler(CodeGenCUDA_Dev::GetSourceHeader() + source_code); + ASSERT_FALSE(ptx.empty()); + + CUDAModule cuda_module(ptx, CUDAModule::Kind::PTX); + CUdeviceptr var = CreateCudaMemory(/* shape */ {64 * 112 * 112}, /* data */ nullptr); + CUdeviceptr out = CreateCudaMemory(/* shape */ {64 * 112 * 112}, /* data */ nullptr); + + void* args[] = {&var, &out}; + dim3 grid(512, 1, 1); + dim3 block(512, 1, 1); + cuda_module.LaunchKernel(/*device_id*/ 0, "fn_relu_1_kernel", grid, block, args); +} + +} // namespace backends +} // namespace cinn diff --git a/paddle/cinn/backends/compiler.cc b/paddle/cinn/backends/compiler.cc new file mode 100644 index 0000000000000..798b0a96a216d --- /dev/null +++ b/paddle/cinn/backends/compiler.cc @@ -0,0 +1,163 @@ +// Copyright (c) 2021 CINN Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#include "cinn/backends/compiler.h" + +#include + +#include "cinn/backends/llvm/runtime_symbol_registry.h" +#include "cinn/common/context.h" +#ifdef CINN_WITH_CUDA +#include "cinn/backends/codegen_cuda_dev.h" +#include "cinn/backends/codegen_cuda_host.h" +#include "cinn/backends/codegen_cuda_util.h" +#include "cinn/backends/nvrtc/nvrtc_util.h" +#include "cinn/runtime/cuda/cuda_module.h" +#include "cinn/runtime/cuda/cuda_util.h" +#include "cinn/runtime/flags.h" +#endif + +DECLARE_string(cinn_source_code_save_path); + +namespace cinn { +namespace backends { +using ir::Module; + +static constexpr int DebugLogMaxLen = 30000; + +SourceCodePrint::SourceCodePrint() { + if (!FLAGS_cinn_source_code_save_path.empty()) { + LOG(INFO) << "The CINN auto generated source code will writing into file: \"" << FLAGS_cinn_source_code_save_path + << "\""; + of.open(FLAGS_cinn_source_code_save_path, std::ios_base::out); + } +} + +SourceCodePrint::~SourceCodePrint() { + if (of.is_open()) { + of.close(); + } +} + +void SourceCodePrint::write(const std::string& source_code) { + std::lock_guard guard(mtx_); + if (of.is_open()) { + of << source_code << std::endl; + } else if (!FLAGS_cinn_source_code_save_path.empty()) { + LOG(WARNING) << "Failed to open \"" << FLAGS_cinn_source_code_save_path << "\", source code will print."; + if (source_code.size() > DebugLogMaxLen) { + LOG(INFO) << "[CUDA] source code-0:\n" << source_code.substr(0, DebugLogMaxLen); + for (int i = 1; i * DebugLogMaxLen < source_code.size(); ++i) { + LOG(INFO) << "[CUDA] source code-" << i << ":\n" << source_code.substr(DebugLogMaxLen * i, DebugLogMaxLen); + } + } else { + LOG(INFO) << "[CUDA] source code:\n" << source_code; + } + } +} + +void Compiler::Build(const Module& module, const std::string& code) { + if (target_.arch == Target::Arch::NVGPU) { + CompileCudaModule(module, code); + } else if (target_.arch == Target::Arch::X86) { + CompileX86Module(module); + } else { + CINN_NOT_IMPLEMENTED + } +} + +std::string Compiler::GetSourceCode(const ir::Module& module) { + if (target_.arch == Target::Arch::NVGPU) { +#ifdef CINN_WITH_CUDA + auto _host_module_device_module_ = SplitCudaAndHostModule(module); // NOLINT + auto& host_module = std::get<0>(_host_module_device_module_); + auto& device_module = std::get<1>(_host_module_device_module_); + CodeGenCUDA_Dev codegen(target_); + auto source_code = codegen.Compile(device_module); + return source_code; +#else + CINN_NOT_IMPLEMENTED +#endif + } else { + CINN_NOT_IMPLEMENTED + } +} + +void Compiler::BuildDefault(const Module& module) { + if (target_.arch == Target::Arch::NVGPU) { + CompileCudaModule(module); + } else if (target_.arch == Target::Arch::X86) { + CompileX86Module(module); + } else { + CINN_NOT_IMPLEMENTED + } +} + +void Compiler::CompileCudaModule(const Module& module, const std::string& code) { +#ifdef CINN_WITH_CUDA + auto _host_module_device_module_ = SplitCudaAndHostModule(module); // NOLINT + auto& host_module = std::get<0>(_host_module_device_module_); + auto& device_module = std::get<1>(_host_module_device_module_); + VLOG(3) << "[CUDA] host module:\n" << host_module; + + VLOG(3) << "[CUDA] device module:\n" << device_module; + std::string source_code; + if (code.empty()) { + CodeGenCUDA_Dev codegen(target_); + source_code = codegen.Compile(device_module); + } else { + source_code = code; + } + CHECK(!source_code.empty()) << "Compile CUDA C code failed from device module:\n" 
<< device_module; + VLOG(3) << "[CUDA] C:\n" << source_code; + SourceCodePrint::GetInstance()->write(source_code); + using runtime::cuda::CUDAModule; + + nvrtc::Compiler compiler; + auto ptx = compiler(source_code); + CHECK(!ptx.empty()) << "Compile PTX failed from source code:\n" << source_code; + cuda_module_.reset( + new CUDAModule(ptx, compiler.compile_to_cubin() ? CUDAModule::Kind::CUBIN : CUDAModule::Kind::PTX)); + + RuntimeSymbols symbols; + for (auto& fn : device_module.functions()) { + std::string kernel_fn_name = fn->name; + auto fn_kernel = cuda_module_->GetFunction(0, kernel_fn_name); + CHECK(fn_kernel); + + symbols.RegisterVar(kernel_fn_name + "_ptr_", reinterpret_cast(fn_kernel)); + } + + engine_ = ExecutionEngine::Create(ExecutionOptions(), std::move(symbols)); + engine_->Link(host_module); + +#else + CINN_NOT_IMPLEMENTED +#endif +} + +void Compiler::CompileX86Module(const Module& module) { engine_->Link(module); } + +void Compiler::ExportObject(const std::string& path) { engine_->ExportObject(path); } + +void* Compiler::Lookup(absl::string_view fn_name) { + CHECK(engine_); + if (engine_->Lookup(fn_name) != nullptr) { + return engine_->Lookup(fn_name); + } + return nullptr; +} + +} // namespace backends +} // namespace cinn diff --git a/paddle/cinn/backends/compiler.h b/paddle/cinn/backends/compiler.h new file mode 100644 index 0000000000000..bba22e60303a6 --- /dev/null +++ b/paddle/cinn/backends/compiler.h @@ -0,0 +1,94 @@ +// Copyright (c) 2021 CINN Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include + +#include +#include +#include +#include + +#include "cinn/backends/llvm/codegen_llvm.h" +#include "cinn/backends/llvm/execution_engine.h" +#include "cinn/backends/llvm/simple_jit.h" +#include "cinn/lang/packed_func.h" +#ifdef CINN_WITH_CUDA +#include "cinn/runtime/cuda/cuda_module.h" +#endif + +namespace cinn { +namespace backends { + +class SourceCodePrint { + public: + static SourceCodePrint* GetInstance() { + static SourceCodePrint print; + return &print; + } + + void write(const std::string& source_code); + + private: + SourceCodePrint(); + ~SourceCodePrint(); + + std::ofstream of; + std::mutex mtx_; +}; + +class Compiler final { + public: + static std::unique_ptr Create(const Target& target) { + return std::unique_ptr(new Compiler(target)); + } + + /** + * Compile and link to a CINN module. + */ + void Build(const ir::Module& module, const std::string& code = ""); + + void ExportObject(const std::string& path); + + std::string GetSourceCode(const ir::Module& module); + + void BuildDefault(const ir::Module& module); + + /** + * Retrieve a function by \p fn_name. + * @return function address or null if not exists. 
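 * The lookup is delegated to the underlying ExecutionEngine.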
+ */ + void* Lookup(absl::string_view fn_name); + + private: + void CompileCudaModule(const ir::Module& module, const std::string& code = ""); + + void CompileX86Module(const ir::Module& module); + + explicit Compiler(const Target& target) : target_(target), engine_(ExecutionEngine::Create(ExecutionOptions())) {} + + CINN_DISALLOW_COPY_AND_ASSIGN(Compiler); + + private: + Target target_; + std::unique_ptr engine_; + +#ifdef CINN_WITH_CUDA + std::unique_ptr cuda_module_; +#endif +}; + +} // namespace backends +} // namespace cinn diff --git a/paddle/cinn/backends/compiler_test.cc b/paddle/cinn/backends/compiler_test.cc new file mode 100644 index 0000000000000..0393c97eb4d5a --- /dev/null +++ b/paddle/cinn/backends/compiler_test.cc @@ -0,0 +1,196 @@ +// Copyright (c) 2021 CINN Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "cinn/backends/compiler.h" + +#include + +#include + +#include "cinn/cinn.h" +#include "cinn/common/test_helper.h" +#include "cinn/hlir/pe/elementwise.h" +#include "cinn/hlir/pe/nn.h" +#include "cinn/runtime/use_extern_funcs.h" +#include "cinn/utils/timer.h" + +namespace cinn { +namespace backends { + +TEST(Compiler, x86) { + Expr M(1024), N(1024); + + auto create_module = [&]() { + Placeholder A("A", {M, N}); + Placeholder B("B", {M, N}); + + auto C = Compute( + {M, N}, [=](Expr i, Expr j) { return A(i, j) + B(i, j); }, "C"); + return std::make_tuple(A, B, C); + }; + + { // test x86 + auto _A_B_C_ = create_module(); // NOLINT + auto& A = std::get<0>(_A_B_C_); + auto& B = std::get<1>(_A_B_C_); + auto& C = std::get<2>(_A_B_C_); + + auto stages = CreateStages({C}); + + auto fn = Lower("fn", stages, {A, B, C}); + + ir::Module::Builder builder("some_module", common::DefaultHostTarget()); + builder.AddFunction(fn); + + auto compiler = Compiler::Create(common::DefaultHostTarget()); + compiler->Build(builder.Build()); + + auto* fnp = compiler->Lookup("fn"); + ASSERT_TRUE(fnp); + + auto* Ab = common::BufferBuilder(Float(32), {M.as_int32(), N.as_int32()}).set_random().Build(); + auto* Bb = common::BufferBuilder(Float(32), {M.as_int32(), N.as_int32()}).set_random().Build(); + auto* Cb = common::BufferBuilder(Float(32), {M.as_int32(), N.as_int32()}).set_zero().Build(); + + auto args = common::ArgsBuilder().Add(Ab).Add(Bb).Add(Cb).Build(); + reinterpret_cast(fnp)(args.data(), args.size()); + + // test result + auto* Ad = reinterpret_cast(Ab->memory); + auto* Bd = reinterpret_cast(Bb->memory); + auto* Cd = reinterpret_cast(Cb->memory); + for (int i = 0; i < Ab->num_elements(); i++) { + ASSERT_NEAR(Ad[i] + Bd[i], Cd[i], 1e-5); + } + } +} + +#ifdef CINN_WITH_CUDA +TEST(Compiler, cuda) { + Expr M(1024), N(1024); + + auto create_module = [&]() { + Placeholder A("A", {M, N}); + Placeholder B("B", {M, N}); + + auto C = Compute( + {M, N}, [=](Expr i, Expr j) { return A(i, j) + B(i, j); }, "C"); + return std::make_tuple(A, B, C); + }; + + { // cuda + auto _A_B_C_ = create_module(); // NOLINT + auto& A = std::get<0>(_A_B_C_); + auto& B = 
std::get<1>(_A_B_C_); + auto& C = std::get<2>(_A_B_C_); + auto stages = CreateStages({C}); + + stages[C]->Bind(0, "blockIdx.x"); + stages[C]->Bind(1, "threadIdx.x"); + + auto fn = Lower("fn", stages, {A, B, C}); + + ir::Module::Builder builder("some_module", common::DefaultHostTarget()); + builder.AddFunction(fn); + + auto compiler = Compiler::Create(common::DefaultNVGPUTarget()); + compiler->Build(builder.Build()); + + auto* fnp = compiler->Lookup("fn"); + ASSERT_TRUE(fnp); + + auto* Ab = common::BufferBuilder(Float(32), {M.as_int32(), N.as_int32()}).set_random().Build(); + auto* Bb = common::BufferBuilder(Float(32), {M.as_int32(), N.as_int32()}).set_random().Build(); + auto* Cb = common::BufferBuilder(Float(32), {M.as_int32(), N.as_int32()}).set_zero().Build(); + + // allocate CUDA buffer + void *Ag, *Bg, *Cg; + const int num_bytes = Ab->num_elements() * sizeof(float); + cudaMalloc(&Ag, num_bytes); + cudaMalloc(&Bg, num_bytes); + cudaMalloc(&Cg, num_bytes); + + CUDA_CALL(cudaMemcpy(Ag, Ab->memory, num_bytes, cudaMemcpyHostToDevice)); + CUDA_CALL(cudaMemcpy(Bg, Bb->memory, num_bytes, cudaMemcpyHostToDevice)); + CUDA_CALL(cudaMemcpy(Cg, Cb->memory, num_bytes, cudaMemcpyHostToDevice)); + + cinn_buffer_t Abb; + Abb.memory = reinterpret_cast(Ag); + cinn_buffer_t Bbb; + Bbb.memory = reinterpret_cast(Bg); + cinn_buffer_t Cbb; + Cbb.memory = reinterpret_cast(Cg); + + auto args = common::ArgsBuilder().Add(&Abb).Add(&Bbb).Add(&Cbb).Build(); + + utils::Timer timer; + timer.Start(); + void* stream = nullptr; + for (int i = 0; i < 1000; i++) { + reinterpret_cast(fnp)(args.data(), args.size(), stream); + } + + CUDA_CALL(cudaDeviceSynchronize()); + float latency = timer.Stop(); + LOG(INFO) << "latency: " << latency / 1000; + + std::vector ch(M.as_int32() * N.as_int32(), 0.f); + CUDA_CALL(cudaMemcpy(ch.data(), Cg, ch.size() * sizeof(float), cudaMemcpyDeviceToHost)); + + auto* Ad = reinterpret_cast(Ab->memory); + auto* Bd = reinterpret_cast(Bb->memory); + for (int i = 0; i < Ab->num_elements(); i++) { + ASSERT_NEAR(Ad[i] + Bd[i], ch[i], 1e-5); + } + } +} +#endif + +TEST(Compiler, sqrt) { + Expr N(100); + Expr C(10); + Expr H(10); + Expr W(10); + + Placeholder input("input", {N, C, H, W}); + Placeholder mean("mean", {C}); + Placeholder scale("scale", {C}); + Placeholder variance("variance", {C}); + Placeholder bias("bias", {C}); + float epsilon = 0.1f; + + auto A = Compute( + {N, C, H, W}, + [=](Expr n, Expr c, Expr h, Expr w) { + return (input(n, c, h, w) - mean(c)) * scale(c) / lang::Sqrt(variance(c) + Expr(epsilon)) + bias(c); + }, + "A"); + + auto B = hlir::pe::Pool2d(input, {3, 3}, {1, 1}, {1, 1, 1, 1}, "max", false, false); + + auto BB = hlir::pe::BatchNorm_NCHW(input, scale, bias, mean, variance, epsilon, "batchnorm"); + + auto stages = CreateStages({input, mean, scale, variance, A, bias, B[0], BB}); + + auto fn = Lower("fn", stages, {input, mean, scale, bias, variance, A, B[0], BB}); + + Module::Builder builder("some", common::DefaultHostTarget()); + builder.AddFunction(fn); + + auto compiler = Compiler::Create(common::DefaultHostTarget()); + compiler->Build(builder.Build()); +} + +} // namespace backends +} // namespace cinn diff --git a/paddle/cinn/backends/cuda_util.cc b/paddle/cinn/backends/cuda_util.cc new file mode 100644 index 0000000000000..fa6f5b25f78df --- /dev/null +++ b/paddle/cinn/backends/cuda_util.cc @@ -0,0 +1,56 @@ +// Copyright (c) 2021 CINN Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "cinn/backends/cuda_util.h" + +#include + +#include "cinn/backends/extern_func_jit_register.h" +#include "cinn/common/target.h" + +namespace cinn { +namespace backends { + +std::string cuda_thread_axis_name(int level) { + switch (level) { + case 0: + return "threadIdx.x"; + break; + case 1: + return "threadIdx.y"; + break; + case 2: + return "threadIdx.z"; + break; + } + return ""; +} + +std::string cuda_block_axis_name(int level) { + switch (level) { + case 0: + return "blockIdx.x"; + break; + case 1: + return "blockIdx.y"; + break; + case 2: + return "blockIdx.z"; + break; + } + return ""; +} + +} // namespace backends +} // namespace cinn diff --git a/paddle/cinn/backends/cuda_util.h b/paddle/cinn/backends/cuda_util.h new file mode 100644 index 0000000000000..f86dc177febc8 --- /dev/null +++ b/paddle/cinn/backends/cuda_util.h @@ -0,0 +1,100 @@ +// Copyright (c) 2021 CINN Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#ifdef CINN_WITH_CUDA + +#include +#include +#include +#include +#include + +#include +#include +#include + +#include "cinn/runtime/cinn_runtime.h" + +#define CUDA_DRIVER_CALL(func) \ + { \ + auto status = func; \ + if (status != CUDA_SUCCESS) { \ + const char* msg; \ + cuGetErrorString(status, &msg); \ + LOG(FATAL) << "CUDA Driver Error: " #func " failed with error: " << msg; \ + } \ + } + +#define CUDA_CALL(func) \ + { \ + auto status = func; \ + if (status != cudaSuccess) { \ + LOG(FATAL) << "CUDA Error : " << cudaGetErrorString(status); \ + } \ + } + +#define CURAND_CALL(func) \ + { \ + auto status = func; \ + if (status != CURAND_STATUS_SUCCESS) { \ + LOG(FATAL) << "CURAND Error : " << status; \ + } \ + } + +#define CUSOLVER_CALL(func) \ + { \ + auto status = func; \ + if (status != CUSOLVER_STATUS_SUCCESS) { \ + LOG(FATAL) << "CUSOLVER Error: " << status; \ + } \ + } + +#define CUBLAS_CALL(func) \ + { \ + auto status = func; \ + if (status != CUBLAS_STATUS_SUCCESS) { \ + LOG(FATAL) << "CUBLAS Error!"; \ + } \ + } + +#define CUDNN_CALL(func) \ + { \ + auto status = func; \ + if (status != CUDNN_STATUS_SUCCESS) { \ + LOG(FATAL) << "CUDNN Error : " << cudnnGetErrorString(status); \ + } \ + } + +#define NVRTC_CALL(func) \ + { \ + auto status = func; \ + if (status != NVRTC_SUCCESS) { \ + LOG(FATAL) << "NVRTC Error : " << nvrtcGetErrorString(status); \ + } \ + } + +namespace cinn { +namespace backends { + +// CUDA syntax for thread axis. 
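// e.g. level 0 -> "threadIdx.x", 1 -> "threadIdx.y", 2 -> "threadIdx.z".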
+std::string cuda_thread_axis_name(int level); + +// CUDA syntax for block axis. +std::string cuda_block_axis_name(int level); + +} // namespace backends +} // namespace cinn + +#endif // CINN_WITH_CUDA diff --git a/paddle/cinn/backends/extern_func_emitter.cc b/paddle/cinn/backends/extern_func_emitter.cc new file mode 100644 index 0000000000000..bede4f99ff198 --- /dev/null +++ b/paddle/cinn/backends/extern_func_emitter.cc @@ -0,0 +1,81 @@ +// Copyright (c) 2021 CINN Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "cinn/backends/extern_func_emitter.h" + +#include +#include + +#include +#include +#include + +#include "cinn/backends/extern_func_emitter_builtin.h" +#include "cinn/backends/llvm/runtime_symbol_registry.h" +#include "cinn/runtime/cpu/host_intrinsics.h" +#include "cinn/runtime/flags.h" +#include "cinn/utils/string.h" + +DECLARE_bool(verbose_function_register); + +namespace cinn { +namespace backends { + +ExternFunctionEmitterRegistry& ExternFunctionEmitterRegistry::Global() { + static ExternFunctionEmitterRegistry x; + return x; +} + +void ExternFunctionEmitterRegistry::Register(const ExternFuncID& name, const std::string& x) { +#ifdef CINN_WITH_DEBUG + if (FLAGS_verbose_function_register) { + RAW_LOG_INFO("Register extern function emitter [%s]", utils::GetStreamCnt(name).c_str()); + } +#endif // CINN_WITH_DEBUG + CHECK(!x.empty()) << "Extern Function name is empty."; + data_[name] = x; +} + +const std::string& ExternFunctionEmitterRegistry::Lookup(const ExternFuncID& name) const { + static const std::string not_found = ""; + auto it = data_.find(name); + if (it != data_.end()) { + return it->second; + } + return not_found; +} + +std::ostream& operator<<(std::ostream& os, const ExternFuncID& x) { + os << x.name << ":" << x.backend_id; + return os; +} + +ExternFunctionEmitterRegistry::ExternFunctionEmitterRegistry() {} + +const FunctionProto& ExternFunctionEmitter::func_proto() const { + auto* proto = ExternFunctionProtoRegistry::Global().Lookup(func_name()); + CHECK(proto) << "No prototype of function [" << func_name() << "]"; + return *proto; +} + +} // namespace backends +} // namespace cinn + +namespace std { + +size_t hash::operator()(const cinn::backends::ExternFuncID& x) const { + return absl::Hash{}(x.name) ^ absl::Hash{}(x.backend_id); +} + +} // namespace std diff --git a/paddle/cinn/backends/extern_func_emitter.h b/paddle/cinn/backends/extern_func_emitter.h new file mode 100644 index 0000000000000..b2b8870d51124 --- /dev/null +++ b/paddle/cinn/backends/extern_func_emitter.h @@ -0,0 +1,134 @@ +// Copyright (c) 2021 CINN Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +/** + * \file Implements the ExternFuncEmitter class, which is the base of all the emitter of extern function in the + * backends. + */ + +#pragma once +#include + +#include +#include +#include + +#include "cinn/backends/extern_func_protos.h" +#include "cinn/ir/ir.h" + +namespace cinn { +namespace backends { +class ExternFuncID; +} // namespace backends +} // namespace cinn + +namespace std { +template <> +struct hash { + size_t operator()(const cinn::backends::ExternFuncID& x) const; +}; +} // namespace std + +namespace cinn { +namespace backends { + +//! IDs of backends. +static const char* backend_C = "C"; +static const char* backend_llvm_host = "llvm_host"; +static const char* backend_llvm_x86 = "llvm_x86"; +static const char* backend_nvgpu = "nvgpu"; + +/** + * \brief Base class of the emitter of all the extern functions able to trigger inside CINN CodeGen system. + * There are some common attributes and interfaces. + */ +class ExternFunctionEmitter { + public: + ExternFunctionEmitter() = default; + + virtual void BindCodeGen(void* codegen) = 0; + /** + * Get the name of the function. + */ + virtual const char* func_name() const = 0; + /** + * Emit a store node, if the call node's RetValuePacked is true, otherwise Emit a Call node. + */ + + void Emit(const ir::Call* op, bool insert_global_if_missing = false) { + insert_global_if_missing_ = insert_global_if_missing; + func_proto().AssertMatch(op); + EmitImpl(op); + } + + const FunctionProto& func_proto() const; + + /** + * \brief Tell whether the return value is packed to the argument list. + * + * e.g. Given the original IR + * \code + * s = Call(some_func, arg0) + * \endcode + * + * If this function returns true, some pass will applied and transform the IR to + * \code + * Call(some_func, get_addr(s) + * \endcode + * + * The `RetValuePacked` should be true when the external function modify an existing buffer (or some view of it) due + * to that the C language can't return a container. + */ + virtual bool RetValuePacked() const = 0; + + /** + * @return the backend identifier of this emitter. 
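 * e.g. one of the backend_* IDs declared above, such as "llvm_host" or
 * "nvgpu".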
+ */ + virtual const char* backend_kind() const = 0; + + protected: + virtual void EmitImpl(const ir::Call* op) = 0; + + bool insert_global_if_missing_ = false; +}; + +struct ExternFuncID { + std::string name; + std::string backend_id; + + ExternFuncID(const char* name, const char* backend_id) : name(name), backend_id(backend_id) {} + + friend std::ostream& operator<<(std::ostream& os, const ExternFuncID& x); + friend bool operator==(const ExternFuncID& a, const ExternFuncID& b) { + return a.name == b.name && a.backend_id == b.backend_id; + } +}; + +class ExternFunctionEmitterRegistry { + public: + static ExternFunctionEmitterRegistry& Global(); + + void Register(const ExternFuncID& name, const std::string& x); + + const std::string& Lookup(const ExternFuncID& name) const; + + private: + absl::flat_hash_map data_; + + ExternFunctionEmitterRegistry(); + CINN_DISALLOW_COPY_AND_ASSIGN(ExternFunctionEmitterRegistry); +}; + +} // namespace backends +} // namespace cinn diff --git a/paddle/cinn/backends/extern_func_emitter_builtin.cc b/paddle/cinn/backends/extern_func_emitter_builtin.cc new file mode 100644 index 0000000000000..087ddc6b81d33 --- /dev/null +++ b/paddle/cinn/backends/extern_func_emitter_builtin.cc @@ -0,0 +1,87 @@ +// Copyright (c) 2021 CINN Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
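
// Implements the LLVM-backed extern function emitter: the registered
// function prototype is translated to an llvm::FunctionType, then a direct
// call to the function is built at the current insertion point.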
+ +#include "cinn/backends/extern_func_emitter_builtin.h" + +#include + +#include "cinn/backends/llvm/ir_builder_mixin.h" +#include "cinn/backends/llvm/llvm_util.h" + +namespace cinn { +namespace backends { + +void ExternFunctionLLVMEmitter::BindCodeGen(void* codegen) { codegen_ = reinterpret_cast(codegen); } + +const char* ExternFunctionLLVMEmitter::func_name() const { return fn_name_.c_str(); } + +bool ExternFunctionLLVMEmitter::RetValuePacked() const { return fn_proto().ret_type.is_void(); } + +FunctionProto& ExternFunctionLLVMEmitter::fn_proto() const { + auto* proto = ExternFunctionProtoRegistry::Global().Lookup(fn_name_); + CHECK(proto) << "No function prototype found for " << fn_name_; + return *proto; +} +llvm::FunctionType* ExternFunctionLLVMEmitter::llvm_fn_type() const { + auto* proto = ExternFunctionProtoRegistry::Global().Lookup(fn_name_); + CHECK(proto) << "No function prototype found for " << fn_name_; + + auto* llvm_ret_type = CinnTypeToLLVMType(proto->ret_type, codegen_->m()); + std::vector arg_types; + for (auto& t : proto->readonly_arg_types) { + arg_types.push_back(CinnTypeToLLVMType(t, codegen_->m())); + } + for (auto& t : proto->mutable_arg_types) { + arg_types.push_back(CinnTypeToLLVMType(t, codegen_->m())); + } + auto* fn_type = llvm::FunctionType::get(llvm_ret_type, arg_types, false); + return fn_type; +} +const char* ExternFunctionLLVMEmitter::backend_kind() const { return nullptr; } + +void ExternFunctionLLVMEmitter::EmitImpl(const ir::Call* op) { + CHECK(codegen_); + CodeGenLLVMforEmitter codegen_for_emitter(codegen_); + llvm::Function* custom_function = llvm::dyn_cast( + codegen_for_emitter.m()->getOrInsertFunction(fn_name_, llvm_fn_type()).getCallee()); + CHECK(custom_function) << "No function registered in JIT called " << fn_name_; + custom_function->setCallingConv(llvm::CallingConv::C); + + std::vector args; + for (auto& v : op->read_args) { + if (v.as_tensor()) { + args.push_back(codegen_for_emitter.GetVar(v.as_tensor()->buffer->name, false)); + } else { + auto* arg = codegen_for_emitter.Visit(&v); + args.push_back(arg); + } + } + for (auto& v : op->write_args) { + if (v.as_tensor()) { + args.push_back(codegen_for_emitter.GetVar(v.as_tensor()->buffer->name, false)); + } else { + auto* arg = codegen_->Visit(&v); + args.push_back(arg); + } + } + + VLOG(3) << "function type " << op->name << ": " << DumpToString(*custom_function); + + auto* command = codegen_for_emitter.b()->CreateCall(custom_function, args); + codegen_->extern_func_emit_res_ = command; + VLOG(3) << "call: " << DumpToString(*command); +} + +} // namespace backends +} // namespace cinn diff --git a/paddle/cinn/backends/extern_func_emitter_builtin.h b/paddle/cinn/backends/extern_func_emitter_builtin.h new file mode 100644 index 0000000000000..59d508e0e8906 --- /dev/null +++ b/paddle/cinn/backends/extern_func_emitter_builtin.h @@ -0,0 +1,61 @@ +// Copyright (c) 2021 CINN Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+
+#pragma once
+
+#include <string>
+#include <vector>
+
+#include "cinn/backends/codegen_c.h"
+#include "cinn/backends/extern_func_emitter.h"
+#include "cinn/backends/extern_func_protos.h"
+#include "cinn/backends/llvm/codegen_llvm.h"
+#include "cinn/backends/llvm/llvm_util.h"
+
+namespace cinn {
+namespace backends {
+
+//! Function names
+
+static const char* extern_tanh_host_repr = "__cinn_host_tanh_fp32";
+static const char* extern_tanh_v_host_repr = "__cinn_host_tanh_v";
+
+/**
+ * A bridge for the Emitters to access CodeGenLLVM's internal members.
+ */
+class CodeGenLLVMforEmitter : public CodeGenLLVM {
+ public:
+  explicit CodeGenLLVMforEmitter(CodeGenLLVM* x) : CodeGenLLVM(x->m(), x->b(), x->named_vars()) {}
+};
+
+class ExternFunctionLLVMEmitter : public ExternFunctionEmitter {
+ public:
+  explicit ExternFunctionLLVMEmitter(const std::string& fn_name) : fn_name_(fn_name) {}
+
+  void BindCodeGen(void* codegen) override;
+  const char* func_name() const override;
+  bool RetValuePacked() const override;
+  const char* backend_kind() const override;
+
+ protected:
+  void EmitImpl(const ir::Call* op) override;
+  FunctionProto& fn_proto() const;
+  llvm::FunctionType* llvm_fn_type() const;
+
+  CodeGenLLVM* codegen_{};
+  std::string fn_name_;
+};
+
+}  // namespace backends
+}  // namespace cinn
diff --git a/paddle/cinn/backends/extern_func_jit_register.cc b/paddle/cinn/backends/extern_func_jit_register.cc
new file mode 100644
index 0000000000000..1c9113c9f5da3
--- /dev/null
+++ b/paddle/cinn/backends/extern_func_jit_register.cc
@@ -0,0 +1,40 @@
+// Copyright (c) 2021 CINN Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "cinn/backends/extern_func_jit_register.h"
+
+#include <utility>
+
+namespace cinn {
+namespace backends {
+
+void RegisterExternFunctionHelper(const std::string &fn_name,
+                                  std::unique_ptr<FunctionProto> &&fn_proto,
+                                  Target target,
+                                  void *fn_ptr) {
+  ExternFunctionProtoRegistry::Global().Register(fn_name, fn_proto.release());
+  CHECK(ExternFunctionProtoRegistry::Global().Lookup(fn_name));
+
+  ExternFunctionEmitterRegistry::Global().Register(ExternFuncID{TargetToBackendRepr(target), fn_name.c_str()}, fn_name);
+
+  if (fn_ptr) {  // sourced (faked) functions are registered without an address
+    GlobalSymbolRegistry::Global().RegisterFn(fn_name, reinterpret_cast<void *>(fn_ptr));
+  }
+}
+
+void RegisterExternFunction::End() {
+  auto fn_proto = fn_proto_builder_.Build();
+  RegisterExternFunctionHelper(fn_name_, std::move(fn_proto), target_, fn_ptr_);
+}
+
+}  // namespace backends
+}  // namespace cinn
diff --git a/paddle/cinn/backends/extern_func_jit_register.h b/paddle/cinn/backends/extern_func_jit_register.h
new file mode 100644
index 0000000000000..ad738ec288667
--- /dev/null
+++ b/paddle/cinn/backends/extern_func_jit_register.h
@@ -0,0 +1,161 @@
+// Copyright (c) 2021 CINN Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+/**
+ * \file This file defines some functions and macros to help register the extern functions into JIT.
+ */
+#pragma once
+
+#include <memory>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "cinn/backends/extern_func_emitter.h"
+#include "cinn/backends/extern_func_emitter_builtin.h"
+#include "cinn/backends/extern_func_protos.h"
+#include "cinn/backends/function_prototype.h"
+#include "cinn/backends/llvm/codegen_llvm.h"
+#include "cinn/backends/llvm/ir_builder_mixin.h"
+#include "cinn/backends/llvm/llvm_util.h"
+#include "cinn/backends/llvm/runtime_symbol_registry.h"
+#include "cinn/common/macros.h"
+
+/**
+ * Helper to register an external function into CINN, including the prototype and the function address.
+ * @param fn__: name of the function
+ * @param target__: the Target.
+ */
+#define REGISTER_EXTERN_FUNC_HELPER(fn__, target__) \
+  ::cinn::backends::RegisterExternFunction(#fn__, target__, reinterpret_cast<void*>(fn__))
+
+#define REGISTER_FACKED_EXTERN_FUNC_HELPER(fn__, target__) ::cinn::backends::RegisterExternFunction(#fn__, target__)
+
+/**
+ * Register an external function with one input and one output.
+ */
+#define REGISTER_EXTERN_FUNC_1_IN_1_OUT(fn__, target__, in_type__, out_type__) \
+  REGISTER_EXTERN_FUNC_HELPER(fn__, target__).SetRetType<out_type__>().AddInputType<in_type__>().End()
+
+/**
+ * Register an external function with two inputs and one output.
+ */
+#define REGISTER_EXTERN_FUNC_2_IN_1_OUT(fn__, target__, in_type1__, in_type2__, out_type__) \
+  REGISTER_EXTERN_FUNC_HELPER(fn__, target__)                                               \
+      .SetRetType<out_type__>()                                                             \
+      .AddInputType<in_type1__>()                                                           \
+      .AddInputType<in_type2__>()                                                           \
+      .End()
+
+/**
+ * Register a sourced function (no function address; it is called in generated source code).
+ */
+#define REGISTER_EXTERN_SOURCE_FUNC_1_IN_1_OUT(fn__, target__, in_type__, out_type__) \
+  REGISTER_FACKED_EXTERN_FUNC_HELPER(fn__, target__).SetRetType<out_type__>().AddInputType<in_type__>().End()
+
+/**
+ * Register a sourced function with two inputs and one output (no function address; it is called in generated source
+ * code).
+ */
+#define REGISTER_EXTERN_SOURCE_FUNC_2_IN_1_OUT(fn__, target__, in_type1__, in_type2__, out_type__) \
+  REGISTER_FACKED_EXTERN_FUNC_HELPER(fn__, target__)                                               \
+      .SetRetType<out_type__>()                                                                    \
+      .AddInputType<in_type1__>()                                                                  \
+      .AddInputType<in_type2__>()                                                                  \
+      .End()
+
+namespace cinn {
+namespace backends {
+
+static const char* TargetToBackendRepr(Target target) {
+  switch (target.arch) {
+    case Target::Arch::X86:
+      return backend_llvm_host;
+    case Target::Arch::NVGPU:
+      return backend_nvgpu;
+    default:
+      CINN_NOT_IMPLEMENTED
+  }
+  return nullptr;
+}
+
+/**
+ * Helper class to register an external function.
+ */
+struct RegisterExternFunction {
+  /**
+   * Constructor.
+   * @param fn_name Name of the function.
+   * @param target Target of the function.
+   * @param fn_ptr Address of the function; not valid if left as null.
+   */
+  RegisterExternFunction(const std::string& fn_name, Target target, void* fn_ptr = nullptr)
+      : fn_name_(fn_name), target_(target), fn_ptr_(fn_ptr), fn_proto_builder_(fn_name) {}
+
+  /**
+   * Add an input type.
+   * @tparam T The input type.
+   * @return itself.
+   */
+  template <typename T>
+  RegisterExternFunction& AddInputType() {
+    fn_proto_builder_.AddInputType<T>();
+    return *this;
+  }
+
+  /**
+   * Add an output type.
+   * @tparam T The output type.
+   * @return itself.
+   */
+  template <typename T>
+  RegisterExternFunction& AddOutputType() {
+    fn_proto_builder_.AddOutputType<T>();
+    return *this;
+  }
+
+  /**
+   * Set the return type.
+   * @tparam T The return type.
+   * @return itself.
+   */
+  template <typename T>
+  RegisterExternFunction& SetRetType() {
+    fn_proto_builder_.SetRetType<T>();
+    return *this;
+  }
+
+  /**
+   * Add a shape inference.
+   * @param handle The handle to help infer the shape.
+   * @return itself.
+   */
+  RegisterExternFunction& SetShapeInference(FunctionProto::shape_inference_t handle) {
+    fn_proto_builder_.SetShapeInference(handle);
+    return *this;
+  }
+
+  /**
+   * End the registration; once ended, further modification is disallowed.
+   */
+  void End();
+
+ private:
+  const std::string& fn_name_;
+  Target target_;
+  void* fn_ptr_{};
+  FunctionProto::Builder fn_proto_builder_;
+};
+
+}  // namespace backends
+}  // namespace cinn
diff --git a/paddle/cinn/backends/extern_func_protos.cc b/paddle/cinn/backends/extern_func_protos.cc
new file mode 100644
index 0000000000000..58472677b3ea9
--- /dev/null
+++ b/paddle/cinn/backends/extern_func_protos.cc
@@ -0,0 +1,66 @@
+// Copyright (c) 2021 CINN Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
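+
+// How the registration macros defined in extern_func_jit_register.h above are
+// meant to be used, as a sketch; the host function cinn_host_relu_fp32 is
+// hypothetical:
+//
+//   float cinn_host_relu_fp32(float x) { return x > 0.f ? x : 0.f; }
+//   REGISTER_EXTERN_FUNC_1_IN_1_OUT(cinn_host_relu_fp32,
+//                                   common::DefaultHostTarget(), float, float);
+//
+// The macro expands to RegisterExternFunction(...).SetRetType<float>()
+// .AddInputType<float>().End(), registering the prototype, the emitter entry,
+// and the symbol address in one statement.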
+
+#include "cinn/backends/extern_func_protos.h"
+
+#include <string>
+#include <vector>
+
+namespace cinn {
+namespace backends {
+
+ExternFunctionProtoRegistry::ExternFunctionProtoRegistry() {
+  static const std::vector<std::string> extern_funcs_fp32_unary = {
+      "exp",  "erf", "sigmoid", "sqrt", "log",  "log2", "log10", "floor", "ceil",  "round", "trunc", "cos",
+      "cosh", "tan", "tanh",    "sin",  "sinh", "acos", "acosh", "asin",  "asinh", "atan",  "atanh", "fabs"};
+  static const std::vector<std::string> extern_funcs_float_bool_unary = {"isnan", "isfinite", "isinf"};
+  static const std::vector<std::string> extern_funcs_int_binary = {
+      "left_shift", "right_shift", "bitwise_or", "bitwise_and", "bitwise_xor", "bitwise_not"};
+  static const std::vector<std::string> extern_funcs_int_int_unary = {"bitwise_not"};
+  for (int i = 0; i < extern_funcs_fp32_unary.size(); ++i) {
+    auto* proto = new FunctionProto(extern_funcs_fp32_unary[i], {Float(32)}, Float(32));
+    Register(proto->name, proto);
+  }
+  for (int i = 0; i < extern_funcs_float_bool_unary.size(); ++i) {
+    auto* proto = new FunctionProto(extern_funcs_float_bool_unary[i], {Float(32)}, Bool());
+    Register(proto->name, proto);
+  }
+  for (int i = 0; i < extern_funcs_int_binary.size(); ++i) {
+    auto* proto = new FunctionProto(extern_funcs_int_binary[i], {Int(32), Int(32)}, Int(32));
+    Register(proto->name, proto);
+  }
+  for (int i = 0; i < extern_funcs_int_int_unary.size(); ++i) {
+    auto* proto = new FunctionProto(extern_funcs_int_int_unary[i], {Int(32)}, Int(32));
+    Register(proto->name, proto);
+  }
+
+  auto* n = detail::CreateTanhVProto();
+  Register(n->name, n);
+}
+
+ExternFunctionProtoRegistry& ExternFunctionProtoRegistry::Global() {
+  static ExternFunctionProtoRegistry x;
+  return x;
+}
+
+namespace detail {
+
+FunctionProto* CreateTanhVProto() {
+  return new FunctionProto(
+      extern_func__tanh_v, {type_of<float*>()}, {type_of<float*>()}, Void(), FunctionProto::ShapeFollowNthArgument(0));
+}
+
+}  // namespace detail
+}  // namespace backends
+}  // namespace cinn
diff --git a/paddle/cinn/backends/extern_func_protos.h b/paddle/cinn/backends/extern_func_protos.h
new file mode 100644
index 0000000000000..8b9dbd230dfd5
--- /dev/null
+++ b/paddle/cinn/backends/extern_func_protos.h
@@ -0,0 +1,43 @@
+// Copyright (c) 2021 CINN Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
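+
+// The registry constructor defined above pre-registers prototypes for the
+// common math functions; a sketch of what a lookup then yields (illustrative):
+//
+//   auto* proto = ExternFunctionProtoRegistry::Global().Lookup("tanh");
+//   // proto->readonly_arg_types == {Float(32)}, proto->ret_type == Float(32)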
+
+#pragma once
+
+#include "cinn/backends/function_prototype.h"
+
+namespace cinn {
+namespace backends {
+
+static const char* extern_func__tanh_v = "tanh_v";
+
+class ExternFunctionProtoRegistry : public FunctionProtoRegistry {
+ public:
+  using FunctionProtoRegistry::Lookup;
+  using FunctionProtoRegistry::Register;
+
+  static ExternFunctionProtoRegistry& Global();
+
+ private:
+  ExternFunctionProtoRegistry();
+  CINN_DISALLOW_COPY_AND_ASSIGN(ExternFunctionProtoRegistry);
+};
+
+namespace detail {
+
+FunctionProto* CreateTanhVProto();
+
+}  // namespace detail
+
+}  // namespace backends
+}  // namespace cinn
diff --git a/paddle/cinn/backends/function_prototype.cc b/paddle/cinn/backends/function_prototype.cc
new file mode 100644
index 0000000000000..87fb0ec2a40b2
--- /dev/null
+++ b/paddle/cinn/backends/function_prototype.cc
@@ -0,0 +1,130 @@
+// Copyright (c) 2021 CINN Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "cinn/backends/function_prototype.h"
+
+#include <glog/raw_logging.h>
+
+#include <sstream>
+
+#include "cinn/ir/tensor.h"
+#include "cinn/runtime/flags.h"
+
+DECLARE_bool(verbose_function_register);
+
+namespace cinn {
+namespace backends {
+
+bool FunctionProto::Match(const ir::Call *op) const {
+  if (name != op->name) return false;
+  if (ret_type != op->type()) return false;
+  if (op->read_args.size() != readonly_arg_types.size()) return false;
+  if (op->write_args.size() != mutable_arg_types.size()) return false;
+
+  for (int i = 0; i < op->read_args.size(); i++) {
+    if (op->read_args[i].type() != readonly_arg_types[i]) return false;
+  }
+  for (int i = 0; i < op->write_args.size(); i++) {
+    if (op->write_args[i].type() != mutable_arg_types[i]) return false;
+  }
+  return true;
+}
+
+void FunctionProto::AssertMatch(const ir::Call *op) const {
+  CHECK_EQ(name, op->name);
+  CHECK_EQ(ret_type, op->type()) << "function proto " << name << " check failed";
+  CHECK_EQ(op->read_args.size(), readonly_arg_types.size()) << "function proto " << name << " check failed";
+  CHECK_EQ(op->write_args.size(), mutable_arg_types.size()) << "function proto " << name << " check failed";
+
+  auto get_type = [](Expr u) {
+    if (u.as_tensor() || u.as_buffer()) {
+      Type t = u.type();
+      return t.set_cpp_handle();
+    }
+    return u.type();
+  };
+  for (int i = 0; i < op->read_args.size(); i++) {
+    if (readonly_arg_types[i] == type_of<cinn_buffer_t*>()) {
+      if (!op->read_args[i].as_tensor()) continue;
+    } else {
+      CHECK_EQ(get_type(op->read_args[i]), readonly_arg_types[i]);
+    }
+  }
+  for (int i = 0; i < op->write_args.size(); i++) {
+    if (mutable_arg_types[i] == type_of<cinn_buffer_t*>()) {
+      if (!op->write_args[i].as_tensor()) continue;
+    } else {
+      CHECK_EQ(get_type(op->write_args[i]), mutable_arg_types[i]);
+    }
+  }
+}
+
+void FunctionProto::CheckValid() {
+  if (ret_type.is_void()) {
+    CHECK(!mutable_arg_types.empty())
+        << "A void function should have at least one mutable argument to output something";
+  } else {
+    CHECK(mutable_arg_types.empty()) << "A function with a return value should not have mutable arguments";
+  }
+}
+
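+// The rule CheckValid enforces above, illustrated with hypothetical
+// prototypes:
+//
+//   FunctionProto("fill_ones", {}, {type_of<cinn_buffer_t*>()}, Void());  // ok
+//   FunctionProto("scale", {type_of<float>()}, {type_of<cinn_buffer_t*>()},
+//                 Float(32));  // would CHECK-fail: non-void with mutable arg
+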
+FunctionProto::shape_inference_t FunctionProto::ShapeFollowNthArgument(int n) {
+  return [=](const std::vector<Expr> &args, int value_offset) {
+    CHECK_LT(n, args.size());
+    auto x = args[n].as_tensor();
+    CHECK(x);
+    return x->shape;
+  };
+}
+
+FunctionProto::FunctionProto(const std::string &name,
+                             const std::vector<Type> &readonly_arg_types,
+                             const std::vector<Type> &mutable_arg_types,
+                             Type ret_type,
+                             FunctionProto::shape_inference_t shape_inference)
+    : name(name),
+      readonly_arg_types(readonly_arg_types),
+      mutable_arg_types(mutable_arg_types),
+      ret_type(ret_type),
+      shape_inference(shape_inference) {
+  CheckValid();
+}
+
+FunctionProto *FunctionProtoRegistry::Lookup(const std::string &name) {
+  auto it = data_.find(name);
+  if (it != data_.end()) {
+    return it->second.get();
+  }
+  return nullptr;
+}
+
+FunctionProto *FunctionProtoRegistry::Register(absl::string_view name, FunctionProto *x) {
+#ifdef CINN_WITH_DEBUG
+  if (FLAGS_verbose_function_register) {
+    RAW_LOG_INFO("Register function prototype [%s]", name.data());
+  }
+#endif  // CINN_WITH_DEBUG
+  data_.emplace(name, std::unique_ptr<FunctionProto>(x));
+  return x;
+}
+
+std::string FunctionProtoRegistry::debug_string() const {
+  std::stringstream ss;
+  for (auto &item : data_) {
+    ss << item.first << "\n";
+  }
+  return ss.str();
+}
+}  // namespace backends
+}  // namespace cinn
diff --git a/paddle/cinn/backends/function_prototype.h b/paddle/cinn/backends/function_prototype.h
new file mode 100644
index 0000000000000..2ec058fa7edb2
--- /dev/null
+++ b/paddle/cinn/backends/function_prototype.h
@@ -0,0 +1,130 @@
+// Copyright (c) 2021 CINN Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <absl/container/flat_hash_map.h>
+#include <absl/strings/string_view.h>
+
+#include <functional>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "cinn/common/common.h"
+#include "cinn/ir/ir.h"
+
+namespace cinn {
+namespace backends {
+
+struct FunctionProto {
+  using shape_inference_t =
+      std::function<std::vector<Expr> /*shape*/ (const std::vector<Expr>& /*arguments*/, int /*value_offset*/)>;
+
+  std::string name;
+  std::vector<Type> readonly_arg_types;
+  std::vector<Type> mutable_arg_types;
+  Type ret_type;
+
+  // Infers the output's shape.
+  shape_inference_t shape_inference;
+
+  /**
+   * Constructor for a multiple-output function.
+   * @param name Name of the function.
+   * @param readonly_arg_types The input types.
+   * @param mutable_arg_types The output types.
+   * @param ret_type The return type, defaults to Void().
+   * @param shape_inference The shape inference for each of the output tensors.
+   */
+  FunctionProto(const std::string& name,
+                const std::vector<Type>& readonly_arg_types,
+                const std::vector<Type>& mutable_arg_types,
+                Type ret_type = Void(),
+                shape_inference_t shape_inference = shape_inference_t());
+
+  /**
+   * Constructor for a single-output function.
+   * @param name Name of the function.
+   * @param input_types The input types.
+   * @param ret_type The return type.
+   */
+  FunctionProto(const std::string& name, const std::vector<Type>& input_types, Type ret_type)
+      : name(name), readonly_arg_types(input_types), ret_type(ret_type) {}
+
+  /**
+   * Tell whether the Call \p op matches the function prototype.
+   */
+  bool Match(const ir::Call* op) const;
+
+  /**
+   * Assert that the call matches the function prototype.
+   */
+  void AssertMatch(const ir::Call* op) const;
+
+  struct Builder {
+    explicit Builder(const std::string& name) {
+      data_.reset(new FunctionProto);
+      data_->name = name;
+    }
+    template <typename T>
+    Builder& SetRetType() {
+      data_->ret_type = type_of<T>();
+      return *this;
+    }
+    template <typename T>
+    Builder& AddInputType() {
+      data_->readonly_arg_types.push_back(type_of<T>());
+      return *this;
+    }
+    template <typename T>
+    Builder& AddOutputType() {
+      data_->mutable_arg_types.push_back(type_of<T>());
+      return *this;
+    }
+    Builder& SetShapeInference(shape_inference_t fn) {
+      data_->shape_inference = fn;
+      return *this;
+    }
+
+    std::unique_ptr<FunctionProto> Build() { return std::move(data_); }
+
+   private:
+    std::unique_ptr<FunctionProto> data_;
+  };
+
+  /**
+   * All the outputs use the n-th argument's shape.
+   */
+  static shape_inference_t ShapeFollowNthArgument(int n);
+
+ protected:
+  void CheckValid();
+
+  FunctionProto() = default;
+};
+
+class FunctionProtoRegistry {
+ public:
+  FunctionProto* Register(absl::string_view name, FunctionProto* x);
+
+  FunctionProto* Lookup(const std::string& name);
+
+  std::string debug_string() const;
+
+ private:
+  absl::flat_hash_map<std::string, std::unique_ptr<FunctionProto>> data_;
+};
+
+}  // namespace backends
+}  // namespace cinn
diff --git a/paddle/cinn/backends/generated1.cu b/paddle/cinn/backends/generated1.cu
new file mode 100644
index 0000000000000..88459ce83f588
--- /dev/null
+++ b/paddle/cinn/backends/generated1.cu
@@ -0,0 +1,15 @@
+// Copyright (c) 2021 CINN Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "cinn/backends/_generated1.cu"
diff --git a/paddle/cinn/backends/generated_module1.cc b/paddle/cinn/backends/generated_module1.cc
new file mode 100644
index 0000000000000..4c74a485bec27
--- /dev/null
+++ b/paddle/cinn/backends/generated_module1.cc
@@ -0,0 +1,15 @@
+// Copyright (c) 2021 CINN Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
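+
+// For reference: the FunctionProto::Builder defined in function_prototype.h
+// above is what the registration macros ultimately drive. A standalone sketch,
+// equivalent to what REGISTER_EXTERN_FUNC_2_IN_1_OUT generates (hypothetical
+// name):
+//
+//   FunctionProto::Builder builder("my_pow");
+//   std::unique_ptr<FunctionProto> proto = builder.SetRetType<float>()
+//                                              .AddInputType<float>()
+//                                              .AddInputType<float>()
+//                                              .Build();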
+ +#include "cinn/backends/_generated_module1.cc" diff --git a/paddle/cinn/backends/ir_schedule_test.cc b/paddle/cinn/backends/ir_schedule_test.cc new file mode 100644 index 0000000000000..0d11d4230d911 --- /dev/null +++ b/paddle/cinn/backends/ir_schedule_test.cc @@ -0,0 +1,3019 @@ +// Copyright (c) 2021 CINN Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "cinn/ir/ir_schedule.h" + +#include +#include + +#include +#include + +#include "cinn/backends/codegen_c.h" +#include "cinn/backends/codegen_c_x86.h" +#include "cinn/backends/codegen_cuda_dev.h" +#include "cinn/cinn.h" +#include "cinn/ir/ir_printer.h" +#include "cinn/lang/lower.h" +#include "cinn/optim/ir_simplify.h" +#include "cinn/optim/remove_schedule_block.h" +#include "cinn/optim/unroll_loops.h" +#include "cinn/optim/vectorize_loops.h" + +namespace cinn { +namespace backends { + +TEST(IrSchedule, split_and_fuse1) { + Context::Global().ResetNameId(); + Expr M(32); + Expr N(32); + Expr P(32); + + Target target = common::DefaultHostTarget(); + + Placeholder A("A", {M, N}); + auto B = Compute( + {M, N}, [&](Var i, Var j) { return A(i, j); }, "B"); + + auto stages = CreateStages({A, B}); + + auto func = cinn::lang::LowerVec("test_split_and_fuse1", stages, {A, B}, {}, {}, nullptr, target, true); + auto ast_expr = func[0]->body; + std::vector vec_ast{ast_expr}; + ir::ModuleExpr mod_expr(vec_ast); + ir::IRSchedule ir_sch(mod_expr); + auto fused = ir_sch.Fuse("B", {0, 1}); + auto splited = ir_sch.Split(fused, {4, -1}); + + auto loops = ir_sch.GetLoops("B"); + fused = ir_sch.Fuse(loops); + splited = ir_sch.Split(fused, {256, -1}); + + Module::Builder builder("module1", target); + for (auto& i : func) { + builder.AddFunction(i); + } + auto module = builder.Build(); + CodeGenC codegen(target); + codegen.SetInlineBuiltinCodes(false); + auto source_code = codegen.Compile(module, CodeGenC::OutputKind::CImpl); + + std::string target_code = R"ROC( +#include +#include + +void test_split_and_fuse1(void* _args, int32_t num_args) +{ + const cinn_buffer_t* _A = cinn_pod_value_to_buffer_p(&(((cinn_pod_value_t*)(_args))[0])); + cinn_buffer_t* _B = cinn_pod_value_to_buffer_p(&(((cinn_pod_value_t*)(_args))[1])); + cinn_buffer_malloc((void*)(0), _B); + const float* A = ((const float*)(_A->memory)); + float* B = ((float*)(_B->memory)); + for (int32_t i_j_fused_i_j_fused_0_fused = 0; i_j_fused_i_j_fused_0_fused < 256; i_j_fused_i_j_fused_0_fused += 1) { + for (int32_t i_j_fused_i_j_fused_0_fused_0 = 0; i_j_fused_i_j_fused_0_fused_0 < 4; i_j_fused_i_j_fused_0_fused_0 += 1) { + B[(((i_j_fused_i_j_fused_0_fused / 8) * 32) + (((4 * i_j_fused_i_j_fused_0_fused) + i_j_fused_i_j_fused_0_fused_0) & 31))] = A[(((i_j_fused_i_j_fused_0_fused / 8) * 32) + (((4 * i_j_fused_i_j_fused_0_fused) + i_j_fused_i_j_fused_0_fused_0) & 31))]; + }; + }; + cinn_buffer_free((void*)(0), _B); +} + +)ROC"; + ASSERT_EQ(utils::Trim(target_code), utils::Trim(source_code)); +} + +TEST(IrSchedule, split_and_fuse2) { + 
Context::Global().ResetNameId(); + Expr M(32); + Expr N(32); + Expr P(32); + + Target target = common::DefaultHostTarget(); + + Placeholder A("A", {M, N}); + auto B = Compute( + {M, N}, [&](Var i, Var j) { return A(i, j); }, "B"); + + auto stages = CreateStages({A, B}); + + auto func = cinn::lang::LowerVec("test_split_and_fuse2", stages, {A, B}, {}, {}, nullptr, target, true); + auto ast_expr = func[0]->body; + std::vector vec_ast{ast_expr}; + ir::ModuleExpr mod_expr(vec_ast); + ir::IRSchedule ir_sch(mod_expr); + auto loops = ir_sch.GetLoops("B"); + + auto fused = ir_sch.Fuse(loops); + auto splited = ir_sch.Split(fused, {-1, 20}); + VLOG(3) << "After split {-1, 20}, IR is : " << ir_sch.GetModule().GetExprs().at(0); + + Module::Builder builder("module1", target); + for (auto& i : func) { + builder.AddFunction(i); + } + auto module = builder.Build(); + CodeGenC codegen(target); + codegen.SetInlineBuiltinCodes(false); + auto source_code = codegen.Compile(module, CodeGenC::OutputKind::CImpl); + + VLOG(3) << "split_and_fuse2 source code is :\n" << source_code; + + std::string target_code = R"ROC( +#include +#include + +void test_split_and_fuse2(void* _args, int32_t num_args) +{ + const cinn_buffer_t* _A = cinn_pod_value_to_buffer_p(&(((cinn_pod_value_t*)(_args))[0])); + cinn_buffer_t* _B = cinn_pod_value_to_buffer_p(&(((cinn_pod_value_t*)(_args))[1])); + cinn_buffer_malloc((void*)(0), _B); + const float* A = ((const float*)(_A->memory)); + float* B = ((float*)(_B->memory)); + for (int32_t i_j_fused = 0; i_j_fused < 52; i_j_fused += 1) { + for (int32_t i_j_fused_0 = 0; i_j_fused_0 < 20; i_j_fused_0 += 1) { + if ((((20 * i_j_fused) + i_j_fused_0) < 1024)) { + B[((20 * i_j_fused) + i_j_fused_0)] = A[((20 * i_j_fused) + i_j_fused_0)]; + }; + }; + }; + cinn_buffer_free((void*)(0), _B); +} + +)ROC"; + ASSERT_EQ(utils::Trim(target_code), utils::Trim(source_code)); +} + +TEST(IrSchedule, reorder1) { + Context::Global().ResetNameId(); + Expr M(32); + Expr N(32); + Expr P(32); + + Target target = common::DefaultHostTarget(); + + Placeholder A("A", {M, N, P}); + auto B = Compute( + {M, N, P}, [&](Var i, Var j, Var k) { return A(i, j, k); }, "B"); + + auto stages = CreateStages({A, B}); + + auto func = cinn::lang::LowerVec("test_reorder1", stages, {A, B}, {}, {}, nullptr, target, true); + auto ast_expr = func[0]->body; + std::vector vec_ast{ast_expr}; + ir::ModuleExpr mod_expr(vec_ast); + ir::IRSchedule ir_sch(mod_expr); + + auto splited = ir_sch.Split("B", 0, {-1, 4}); + splited = ir_sch.Split("B", 2, {-1, 2}); + + auto loops = ir_sch.GetLoops("B"); + ir_sch.Reorder({loops[4], loops[0]}); + + Module::Builder builder("module1", target); + for (auto& i : func) { + builder.AddFunction(i); + } + auto module = builder.Build(); + CodeGenC codegen(target); + codegen.SetInlineBuiltinCodes(false); + auto source_code = codegen.Compile(module, CodeGenC::OutputKind::CImpl); + + VLOG(3) << "reorder1 source code is :\n" << source_code; + + std::string target_code = R"ROC( +#include +#include + +void test_reorder1(void* _args, int32_t num_args) +{ + const cinn_buffer_t* _A = cinn_pod_value_to_buffer_p(&(((cinn_pod_value_t*)(_args))[0])); + cinn_buffer_t* _B = cinn_pod_value_to_buffer_p(&(((cinn_pod_value_t*)(_args))[1])); + cinn_buffer_malloc((void*)(0), _B); + const float* A = ((const float*)(_A->memory)); + float* B = ((float*)(_B->memory)); + for (int32_t k = 0; k < 32; k += 1) { + for (int32_t i_0 = 0; i_0 < 4; i_0 += 1) { + for (int32_t j = 0; j < 16; j += 1) { + for (int32_t j_0 = 0; j_0 < 2; j_0 += 1) { + for 
(int32_t i = 0; i < 8; i += 1) { + B[((4096 * i) + ((1024 * i_0) + ((64 * j) + ((32 * j_0) + k))))] = A[((4096 * i) + ((1024 * i_0) + ((64 * j) + ((32 * j_0) + k))))]; + }; + }; + }; + }; + }; + cinn_buffer_free((void*)(0), _B); +} + +)ROC"; + ASSERT_EQ(utils::Trim(target_code), utils::Trim(source_code)); +} + +TEST(IrSchedule, reorder2) { + Context::Global().ResetNameId(); + Expr M(32); + Expr N(32); + Expr P(32); + + Target target = common::DefaultHostTarget(); + + Placeholder A("A", {M, N, P}); + auto B = Compute( + {M, N, P}, [&](Var i, Var j, Var k) { return A(i, j, k); }, "B"); + + auto stages = CreateStages({A, B}); + + auto func = cinn::lang::LowerVec("test_reorder2", stages, {A, B}, {}, {}, nullptr, target, true); + auto ast_expr = func[0]->body; + std::vector vec_ast{ast_expr}; + ir::ModuleExpr mod_expr(vec_ast); + ir::IRSchedule ir_sch(mod_expr); + + auto splited = ir_sch.Split("B", 0, {-1, 4}); + splited = ir_sch.Split("B", 2, {-1, 2}); + + ir_sch.Reorder("B", {4, 2, 3, 1, 0}); + + Module::Builder builder("module1", target); + for (auto& i : func) { + builder.AddFunction(i); + } + auto module = builder.Build(); + CodeGenC codegen(target); + codegen.SetInlineBuiltinCodes(false); + auto source_code = codegen.Compile(module, CodeGenC::OutputKind::CImpl); + + VLOG(3) << "reorder2 source code is :\n" << source_code; + + std::string target_code = R"ROC( +#include +#include + +void test_reorder2(void* _args, int32_t num_args) +{ + const cinn_buffer_t* _A = cinn_pod_value_to_buffer_p(&(((cinn_pod_value_t*)(_args))[0])); + cinn_buffer_t* _B = cinn_pod_value_to_buffer_p(&(((cinn_pod_value_t*)(_args))[1])); + cinn_buffer_malloc((void*)(0), _B); + const float* A = ((const float*)(_A->memory)); + float* B = ((float*)(_B->memory)); + for (int32_t k = 0; k < 32; k += 1) { + for (int32_t j = 0; j < 16; j += 1) { + for (int32_t j_0 = 0; j_0 < 2; j_0 += 1) { + for (int32_t i_0 = 0; i_0 < 4; i_0 += 1) { + for (int32_t i = 0; i < 8; i += 1) { + B[((4096 * i) + ((1024 * i_0) + ((64 * j) + ((32 * j_0) + k))))] = A[((4096 * i) + ((1024 * i_0) + ((64 * j) + ((32 * j_0) + k))))]; + }; + }; + }; + }; + }; + cinn_buffer_free((void*)(0), _B); +} + +)ROC"; + ASSERT_EQ(utils::Trim(target_code), utils::Trim(source_code)); +} + +TEST(IrSchedule, reorder3) { + Context::Global().ResetNameId(); + Expr M(32); + Expr N(32); + Expr P(32); + + Target target = common::DefaultHostTarget(); + + Placeholder A("A", {M, N, P}); + auto B = Compute( + {M, N, P}, [&](Var i, Var j, Var k) { return A(i, j, k); }, "B"); + + auto stages = CreateStages({A, B}); + + auto func = cinn::lang::LowerVec("test_reorder3", stages, {A, B}, {}, {}, nullptr, target, true); + auto ast_expr = func[0]->body; + std::vector vec_ast{ast_expr}; + ir::ModuleExpr mod_expr(vec_ast); + ir::IRSchedule ir_sch(mod_expr); + auto all_blocks = ir_sch.GetAllBlocks(); + auto loops = ir_sch.GetLoops(all_blocks[0]); + + auto splited = ir_sch.Split(loops[0], {-1, 5}); + splited = ir_sch.Split("B", 2, {-1, 2}); + + ir_sch.Reorder("B", {3, 1, 2, 0, 4}); + + Module::Builder builder("module1", target); + for (auto& i : func) { + builder.AddFunction(i); + } + auto module = builder.Build(); + CodeGenC codegen(target); + codegen.SetInlineBuiltinCodes(false); + auto source_code = codegen.Compile(module, CodeGenC::OutputKind::CImpl); + + VLOG(3) << "reorder3 source code is :\n" << source_code; + + std::string target_code = R"ROC( +#include +#include + +void test_reorder3(void* _args, int32_t num_args) +{ + const cinn_buffer_t* _A = 
cinn_pod_value_to_buffer_p(&(((cinn_pod_value_t*)(_args))[0])); + cinn_buffer_t* _B = cinn_pod_value_to_buffer_p(&(((cinn_pod_value_t*)(_args))[1])); + cinn_buffer_malloc((void*)(0), _B); + const float* A = ((const float*)(_A->memory)); + float* B = ((float*)(_B->memory)); + for (int32_t j_0 = 0; j_0 < 2; j_0 += 1) { + for (int32_t i_0 = 0; i_0 < 5; i_0 += 1) { + for (int32_t j = 0; j < 16; j += 1) { + for (int32_t i = 0; i < 7; i += 1) { + if ((((5 * i) + i_0) < 32)) { + for (int32_t k = 0; k < 32; k += 1) { + B[((5120 * i) + ((1024 * i_0) + ((64 * j) + ((32 * j_0) + k))))] = A[((5120 * i) + ((1024 * i_0) + ((64 * j) + ((32 * j_0) + k))))]; + }; + }; + }; + }; + }; + }; + cinn_buffer_free((void*)(0), _B); +} + +)ROC"; + ASSERT_EQ(utils::Trim(target_code), utils::Trim(source_code)); +} + +TEST(IrSchedule, reorder4) { + Context::Global().ResetNameId(); + Expr M(32); + Expr N(32); + Expr P(32); + + Target target = common::DefaultHostTarget(); + + Placeholder A("A", {M, N, P}); + auto B = Compute( + {M, N, P}, [&](Var i, Var j, Var k) { return A(i, j, k); }, "B"); + + auto stages = CreateStages({A, B}); + + auto func = cinn::lang::LowerVec("test_reorder4", stages, {A, B}, {}, {}, nullptr, target, true); + auto ast_expr = func[0]->body; + std::vector vec_ast{ast_expr}; + ir::ModuleExpr mod_expr(vec_ast); + ir::IRSchedule ir_sch(mod_expr); + + auto all_blocks = ir_sch.GetAllBlocks(); + auto block_b = ir_sch.GetBlock("B"); + auto loops = ir_sch.GetLoops(block_b); + + auto splited = ir_sch.Split("B", 0, {-1, 10}); + splited = ir_sch.Split("B", 2, {-1, 5}); + + ir_sch.Reorder("B", {0, 2, 1, 3, 4}); + + Module::Builder builder("module1", target); + for (auto& i : func) { + builder.AddFunction(i); + } + auto module = builder.Build(); + CodeGenC codegen(target); + codegen.SetInlineBuiltinCodes(false); + auto source_code = codegen.Compile(module, CodeGenC::OutputKind::CImpl); + + VLOG(3) << "reorder4 source code is :\n" << source_code; + + std::string target_code = R"ROC( +#include +#include + +void test_reorder4(void* _args, int32_t num_args) +{ + const cinn_buffer_t* _A = cinn_pod_value_to_buffer_p(&(((cinn_pod_value_t*)(_args))[0])); + cinn_buffer_t* _B = cinn_pod_value_to_buffer_p(&(((cinn_pod_value_t*)(_args))[1])); + cinn_buffer_malloc((void*)(0), _B); + const float* A = ((const float*)(_A->memory)); + float* B = ((float*)(_B->memory)); + for (int32_t i = 0; i < 4; i += 1) { + for (int32_t j = 0; j < 7; j += 1) { + for (int32_t i_0 = 0; i_0 < 10; i_0 += 1) { + if ((((10 * i) + i_0) < 32)) { + for (int32_t j_0 = 0; j_0 < 5; j_0 += 1) { + if ((((5 * j) + j_0) < 32)) { + for (int32_t k = 0; k < 32; k += 1) { + B[((10240 * i) + ((1024 * i_0) + ((160 * j) + ((32 * j_0) + k))))] = A[((10240 * i) + ((1024 * i_0) + ((160 * j) + ((32 * j_0) + k))))]; + }; + }; + }; + }; + }; + }; + }; + cinn_buffer_free((void*)(0), _B); +} + +)ROC"; + ASSERT_EQ(utils::Trim(target_code), utils::Trim(source_code)); +} + +#ifdef CINN_USE_OPENMP +TEST(IrSchedule, parallel) { + Context::Global().ResetNameId(); + Expr M(32); + Expr N(32); + + Target target = common::DefaultHostTarget(); + + Placeholder A("A", {M, N}); + auto B = Compute( + {M, N}, [&](Var i, Var j) { return A(i, j); }, "B"); + + auto stages = CreateStages({A, B}); + auto func = cinn::lang::LowerVec("test_parallel", stages, {A, B}, {}, {}, nullptr, target, true); + CHECK(!func.empty()); + auto ast_expr = func[0]->body; + std::vector vec_ast{ast_expr}; + ir::ModuleExpr mod_expr(vec_ast); + ir::IRSchedule ir_sch(mod_expr); + auto loops = ir_sch.GetLoops("B"); + 
CHECK(!loops.empty()); + ir_sch.Parallel(loops[0]); + + Module::Builder builder("module1", target); + for (auto& i : func) { + builder.AddFunction(i); + } + auto module = builder.Build(); + CodeGenC codegen(target); + codegen.SetInlineBuiltinCodes(false); + auto source_code = codegen.Compile(module, CodeGenC::OutputKind::CImpl); + + std::string target_code = R"ROC( +#include +#include + +void test_parallel(void* _args, int32_t num_args) +{ + const cinn_buffer_t* _A = cinn_pod_value_to_buffer_p(&(((cinn_pod_value_t*)(_args))[0])); + cinn_buffer_t* _B = cinn_pod_value_to_buffer_p(&(((cinn_pod_value_t*)(_args))[1])); + cinn_buffer_malloc((void*)(0), _B); + const float* A = ((const float*)(_A->memory)); + float* B = ((float*)(_B->memory)); + int num_task = max_concurrency(); + omp_set_num_threads(num_task); + auto flambda = [=](int task_id, int num_task) -> int { + int n_per_task = (((32 + num_task) - 1) / num_task); + for (int32_t i = (task_id * n_per_task); i < 32 && i < ((task_id + 1) * n_per_task); i += 1) { + for (int32_t j = 0; j < 32; j += 1) { + B[((32 * i) + j)] = A[((32 * i) + j)]; + }; + } + return 0; + }; +#pragma omp parallel num_threads(num_task) + { + int task_id = omp_get_thread_num(); + flambda(task_id, num_task); + }; + cinn_buffer_free((void*)(0), _B); +} +)ROC"; + ASSERT_EQ(utils::Trim(target_code), utils::Trim(source_code)); +} +#endif // CINN_USE_OPENMP + +TEST(IrSchedule, vectorize) { + Context::Global().ResetNameId(); + Expr M(32); + Expr N(32); + + Target target = common::DefaultHostTarget(); + + Placeholder A("A", {M, N}); + auto B = Compute( + {M, N}, [&](Var i, Var j) { return A(i, j); }, "B"); + + auto stages = CreateStages({A, B}); + auto func = cinn::lang::LowerVec("test_vectorize", stages, {A, B}, {}, {}, nullptr, target, true); + CHECK(!func.empty()); + auto ast_expr = func[0]->body; + std::vector vec_ast{ast_expr}; + ir::ModuleExpr mod_expr(vec_ast); + ir::IRSchedule ir_sch(mod_expr); + auto loops = ir_sch.GetLoops("B"); + CHECK_EQ(loops.size(), 2U); + ir_sch.Vectorize(loops[1], 16); + std::string origin = utils::GetStreamCnt(func[0]); + EXPECT_EQ(origin, utils::Trim(R"ROC( +function test_vectorize (_A, _B) +{ + ScheduleBlock(root) + { + serial for (i, 0, 32) + { + vectorize[16] for (j, 0, 32) + { + ScheduleBlock(B) + { + i0, i1 = axis.bind(i, j) + B[i0, i1] = A[i0, i1] + } + } + } + } +} +)ROC")); + optim::VectorizeLoops(&func[0]->body, target); + Module::Builder builder("module1", target); + for (auto& i : func) { + builder.AddFunction(i); + } + auto module = builder.Build(); + CodeGenC codegen(target); + codegen.SetInlineBuiltinCodes(false); + auto source_code = codegen.Compile(module, CodeGenC::OutputKind::CImpl); + + std::string target_code = R"ROC( +#include +#include + +void test_vectorize(void* _args, int32_t num_args) +{ + const cinn_buffer_t* _A = cinn_pod_value_to_buffer_p(&(((cinn_pod_value_t*)(_args))[0])); + cinn_buffer_t* _B = cinn_pod_value_to_buffer_p(&(((cinn_pod_value_t*)(_args))[1])); + cinn_buffer_malloc((void*)(0), _B); + const float* A = ((const float*)(_A->memory)); + float* B = ((float*)(_B->memory)); + for (int32_t i = 0; i < 32; i += 1) { + for (int32_t j = 0; j < 2; j += 1) { + B[StackVec<16,int32_t>::Ramp(((32 * i) + (16 * j)), 1, 16)] = StackedVec::Load(A,((32 * i) + (16 * j))); + }; + }; + cinn_buffer_free((void*)(0), _B); +} +)ROC"; + ASSERT_EQ(utils::Trim(target_code), utils::Trim(source_code)); +} + +TEST(IrSchedule, unroll) { + Context::Global().ResetNameId(); + Expr M(32); + Expr N(2); + + Target target = 
common::DefaultHostTarget(); + + Placeholder A("A", {M, N}); + auto B = Compute( + {M, N}, [&](Var i, Var j) { return A(i, j); }, "B"); + + auto stages = CreateStages({A, B}); + auto func = cinn::lang::LowerVec("test_unroll", stages, {A, B}, {}, {}, nullptr, target, true); + CHECK(!func.empty()); + auto ast_expr = func[0]->body; + std::vector vec_ast{ast_expr}; + ir::ModuleExpr mod_expr(vec_ast); + ir::IRSchedule ir_sch(mod_expr); + auto loops = ir_sch.GetLoops("B"); + CHECK_EQ(loops.size(), 2U); + ir_sch.Unroll(loops[1]); + std::string origin = utils::GetStreamCnt(func[0]); + EXPECT_EQ(origin, utils::Trim(R"ROC( +function test_unroll (_A, _B) +{ + ScheduleBlock(root) + { + serial for (i, 0, 32) + { + unroll for (j, 0, 2) + { + ScheduleBlock(B) + { + i0, i1 = axis.bind(i, j) + B[i0, i1] = A[i0, i1] + } + } + } + } +} +)ROC")); + optim::UnrollLoop(&func[0]->body); + Module::Builder builder("module1", target); + for (auto& i : func) { + builder.AddFunction(i); + } + auto module = builder.Build(); + CodeGenC codegen(target); + codegen.SetInlineBuiltinCodes(false); + auto source_code = codegen.Compile(module, CodeGenC::OutputKind::CImpl); + + std::string target_code = R"ROC( +#include +#include + +void test_unroll(void* _args, int32_t num_args) +{ + const cinn_buffer_t* _A = cinn_pod_value_to_buffer_p(&(((cinn_pod_value_t*)(_args))[0])); + cinn_buffer_t* _B = cinn_pod_value_to_buffer_p(&(((cinn_pod_value_t*)(_args))[1])); + cinn_buffer_malloc((void*)(0), _B); + const float* A = ((const float*)(_A->memory)); + float* B = ((float*)(_B->memory)); + for (int32_t i = 0; i < 32; i += 1) { + B[(2 * i)] = A[(2 * i)]; + B[(1 + (2 * i))] = A[(1 + (2 * i))]; + }; + cinn_buffer_free((void*)(0), _B); +} +)ROC"; + ASSERT_EQ(utils::Trim(target_code), utils::Trim(source_code)); +} + +TEST(IrSchedule, bind) { + Context::Global().ResetNameId(); + Expr M(32); + Expr N(2); + + Target target = common::DefaultHostTarget(); + + Placeholder A("A", {M, N}); + auto B = Compute( + {M, N}, [&](Var i, Var j) { return A(i, j); }, "B"); + + auto stages = CreateStages({A, B}); + auto func = cinn::lang::LowerVec("test_bind", stages, {A, B}, {}, {}, nullptr, target, true); + CHECK(!func.empty()); + auto ast_expr = func[0]->body; + std::vector vec_ast{ast_expr}; + ir::ModuleExpr mod_expr(vec_ast); + ir::IRSchedule ir_sch(mod_expr); + auto loops = ir_sch.GetLoops("B"); + CHECK_EQ(loops.size(), 2U); + ir_sch.Bind(loops[0], "blockIdx.x"); + std::string origin = utils::GetStreamCnt(func[0]); + EXPECT_EQ(origin, utils::Trim(R"ROC( +function test_bind (_A, _B) +{ + ScheduleBlock(root) + { + thread_bind[blockIdx.x] for (i, 0, 32) + { + serial for (j, 0, 2) + { + ScheduleBlock(B) + { + i0, i1 = axis.bind(i, j) + B[i0, i1] = A[i0, i1] + } + } + } + } +} +)ROC")); +} + +TEST(IrSchedule, simple_compute_at) { + Context::Global().ResetNameId(); + Expr M(128); + Expr N(10); + + Target target = common::DefaultHostTarget(); + + Placeholder A("A", {M, N}); + auto B = Compute( + {M, N}, [&](Var i, Var j) { return A(i, j); }, "B"); + auto C = Compute( + {M, N}, [&](Var i, Var j) { return B(i, j); }, "C"); + + auto stages = CreateStages({A, B, C}); + + auto func = cinn::lang::LowerVec("test_simple_compute_at", stages, {A, C}, {}, {}, nullptr, target, true); + CHECK_EQ(func.size(), 1U); + + auto ast_expr = func[0]->body; + std::vector vec_ast{ast_expr}; + ir::ModuleExpr mod_expr(vec_ast); + ir::IRSchedule ir_sch(mod_expr); + + auto fused = ir_sch.Fuse("B", {0, 1}); + auto splited = ir_sch.Split(fused, {-1, 1024}); + + fused = ir_sch.Fuse("C", {0, 
1}); + splited = ir_sch.Split(fused, {-1, 1024}); + auto block_b = ir_sch.GetBlock("B"); + ir_sch.SimpleComputeAt(block_b, splited[1]); + + Module::Builder builder("module1", target); + for (auto& i : func) { + builder.AddFunction(i); + } + auto module = builder.Build(); + CodeGenC codegen(target); + codegen.SetInlineBuiltinCodes(false); + auto source_code = codegen.Compile(module, CodeGenC::OutputKind::CImpl); + + VLOG(1) << "simple_compute_at source code is :\n" << source_code; + + std::string target_code = R"ROC( +#include +#include + +void test_simple_compute_at(void* _args, int32_t num_args) +{ + const cinn_buffer_t* _A = cinn_pod_value_to_buffer_p(&(((cinn_pod_value_t*)(_args))[0])); + cinn_buffer_t* _C = cinn_pod_value_to_buffer_p(&(((cinn_pod_value_t*)(_args))[1])); + cinn_buffer_t* _B = cinn_buffer_t::new_((cinn_device_kind_t)(0)/*target*/, cinn_float32_t(), { 128, 10 }); + cinn_buffer_malloc((void*)(0), _C); + cinn_buffer_malloc((void*)(0), _B); + const float* A = ((const float*)(_A->memory)); + float* B = ((float*)(_B->memory)); + float* C = ((float*)(_C->memory)); + for (int32_t i_j_fused_1 = 0; i_j_fused_1 < 2; i_j_fused_1 += 1) { + for (int32_t i_j_fused_2 = 0; i_j_fused_2 < 1024; i_j_fused_2 += 1) { + if ((((1024 * i_j_fused_1) + i_j_fused_2) < 1280)) { + { + B[((1024 * i_j_fused_1) + i_j_fused_2)] = A[((1024 * i_j_fused_1) + i_j_fused_2)]; + C[((1024 * i_j_fused_1) + i_j_fused_2)] = B[((1024 * i_j_fused_1) + i_j_fused_2)]; + } + }; + }; + }; + cinn_buffer_free((void*)(0), _B); + cinn_buffer_free((void*)(0), _C); +} + +)ROC"; + ASSERT_EQ(utils::Trim(target_code), utils::Trim(source_code)); +} + +TEST(IrSchedule, compute_at0) { + Context::Global().ResetNameId(); + Expr M(128); + Expr N(10); + + Target target = common::DefaultHostTarget(); + + Placeholder A("A", {M, N}); + auto B = Compute( + {M, N}, [&](Var i, Var j) { return A(i, j); }, "B"); + auto C = Compute( + {M, N}, [&](Var i, Var j) { return B(i, j); }, "C"); + + auto stages = CreateStages({A, B, C}); + + auto func = cinn::lang::LowerVec("test_compute_at0", stages, {A, C}, {}, {}, nullptr, target, true); + CHECK_EQ(func.size(), 1U); + + auto ast_expr = func[0]->body; + std::vector vec_ast{ast_expr}; + ir::ModuleExpr mod_expr(vec_ast); + ir::IRSchedule ir_sch(mod_expr); + + auto fused = ir_sch.Fuse("B", {0, 1}); + auto splited = ir_sch.Split(fused, {-1, 1024}); + + fused = ir_sch.Fuse("C", {0, 1}); + splited = ir_sch.Split(fused, {-1, 1024}); + auto block_b = ir_sch.GetBlock("B"); + ir_sch.ComputeAt(block_b, splited[1]); + + Module::Builder builder("module1", target); + for (auto& i : func) { + builder.AddFunction(i); + } + auto module = builder.Build(); + CodeGenC codegen(target); + codegen.SetInlineBuiltinCodes(false); + auto source_code = codegen.Compile(module, CodeGenC::OutputKind::CImpl); + + VLOG(1) << "compute_at0 source code is :\n" << source_code; + + std::string target_code = R"ROC( +#include +#include + +void test_compute_at0(void* _args, int32_t num_args) +{ + const cinn_buffer_t* _A = cinn_pod_value_to_buffer_p(&(((cinn_pod_value_t*)(_args))[0])); + cinn_buffer_t* _C = cinn_pod_value_to_buffer_p(&(((cinn_pod_value_t*)(_args))[1])); + cinn_buffer_t* _B = cinn_buffer_t::new_((cinn_device_kind_t)(0)/*target*/, cinn_float32_t(), { 128, 10 }); + cinn_buffer_malloc((void*)(0), _C); + cinn_buffer_malloc((void*)(0), _B); + const float* A = ((const float*)(_A->memory)); + float* B = ((float*)(_B->memory)); + float* C = ((float*)(_C->memory)); + for (int32_t i_j_fused_1 = 0; i_j_fused_1 < 2; i_j_fused_1 += 1) { + 
for (int32_t i_j_fused_2 = 0; i_j_fused_2 < 1024; i_j_fused_2 += 1) { + if ((((1024 * i_j_fused_1) + i_j_fused_2) < 1280)) { + { + B[((1024 * i_j_fused_1) + i_j_fused_2)] = A[((1024 * i_j_fused_1) + i_j_fused_2)]; + C[((1024 * i_j_fused_1) + i_j_fused_2)] = B[((1024 * i_j_fused_1) + i_j_fused_2)]; + } + }; + }; + }; + cinn_buffer_free((void*)(0), _B); + cinn_buffer_free((void*)(0), _C); +} + +)ROC"; + ASSERT_EQ(utils::Trim(target_code), utils::Trim(source_code)); +} + +TEST(IrSchedule, compute_at1) { + Context::Global().ResetNameId(); + Expr M(32); + Expr N(32); + Expr P(32); + + Target target = common::DefaultHostTarget(); + + Placeholder A("A", {M, N, P}); + auto B = Compute( + {M, N, P}, [&](Var i, Var j, Var k) { return A(i, j, k); }, "B"); + auto C = Compute( + {M, N, P}, [&](Var i, Var j, Var k) { return B(i, j, k); }, "C"); + + auto stages = CreateStages({A, B, C}); + + auto func = cinn::lang::LowerVec("test_compute_at1", stages, {A, C}, {}, {}, nullptr, target, true); + CHECK_EQ(func.size(), 1U); + + auto ast_expr = func[0]->body; + std::vector vec_ast{ast_expr}; + ir::ModuleExpr mod_expr(vec_ast); + ir::IRSchedule ir_sch(mod_expr); + + auto block_b = ir_sch.GetBlock("B"); + auto loops = ir_sch.GetLoops("C"); + + ir_sch.ComputeAt(block_b, loops[1]); + + Module::Builder builder("module1", target); + for (auto& i : func) { + builder.AddFunction(i); + } + auto module = builder.Build(); + CodeGenC codegen(target); + codegen.SetInlineBuiltinCodes(false); + auto source_code = codegen.Compile(module, CodeGenC::OutputKind::CImpl); + + VLOG(1) << "compute_at1 source code is :\n" << source_code; + + std::string target_code = R"ROC( +#include +#include + +void test_compute_at1(void* _args, int32_t num_args) +{ + const cinn_buffer_t* _A = cinn_pod_value_to_buffer_p(&(((cinn_pod_value_t*)(_args))[0])); + cinn_buffer_t* _C = cinn_pod_value_to_buffer_p(&(((cinn_pod_value_t*)(_args))[1])); + cinn_buffer_t* _B = cinn_buffer_t::new_((cinn_device_kind_t)(0)/*target*/, cinn_float32_t(), { 32, 32, 32 }); + cinn_buffer_malloc((void*)(0), _C); + cinn_buffer_malloc((void*)(0), _B); + const float* A = ((const float*)(_A->memory)); + float* B = ((float*)(_B->memory)); + float* C = ((float*)(_C->memory)); + for (int32_t i = 0; i < 32; i += 1) { + for (int32_t j = 0; j < 32; j += 1) { + for (int32_t ax0 = 0; ax0 < 32; ax0 += 1) { + B[((1024 * i) + ((32 * j) + ax0))] = A[((1024 * i) + ((32 * j) + ax0))]; + }; + for (int32_t k = 0; k < 32; k += 1) { + C[((1024 * i) + ((32 * j) + k))] = B[((1024 * i) + ((32 * j) + k))]; + }; + }; + }; + cinn_buffer_free((void*)(0), _B); + cinn_buffer_free((void*)(0), _C); +} + +)ROC"; + ASSERT_EQ(utils::Trim(target_code), utils::Trim(source_code)); +} + +TEST(IrSchedule, compute_at2) { + Context::Global().ResetNameId(); + Expr M(64); + Expr N(32); + + Target target = common::DefaultHostTarget(); + + Placeholder A("A", {M, M}); + auto B = Compute( + {M, M}, [&](Var i, Var j) { return A(i, j); }, "B"); + auto C = Compute( + {N, N}, [&](Var i, Var j) { return B(i + j, i + j); }, "C"); + + auto stages = CreateStages({A, B, C}); + + auto func = cinn::lang::LowerVec("test_compute_at2", stages, {A, C}, {}, {}, nullptr, target, true); + CHECK_EQ(func.size(), 1U); + + auto ast_expr = func[0]->body; + std::vector vec_ast{ast_expr}; + ir::ModuleExpr mod_expr(vec_ast); + ir::IRSchedule ir_sch(mod_expr); + + auto block_b = ir_sch.GetBlock("B"); + auto loops = ir_sch.GetLoops("C"); + + ir_sch.ComputeAt(block_b, loops[0]); + + Module::Builder builder("module1", target); + for (auto& i : func) 
{ + builder.AddFunction(i); + } + auto module = builder.Build(); + CodeGenC codegen(target); + codegen.SetInlineBuiltinCodes(false); + auto source_code = codegen.Compile(module, CodeGenC::OutputKind::CImpl); + + VLOG(1) << "compute_at2 source code is :\n" << source_code; + + std::string target_code = R"ROC( +#include +#include + +void test_compute_at2(void* _args, int32_t num_args) +{ + const cinn_buffer_t* _A = cinn_pod_value_to_buffer_p(&(((cinn_pod_value_t*)(_args))[0])); + cinn_buffer_t* _C = cinn_pod_value_to_buffer_p(&(((cinn_pod_value_t*)(_args))[1])); + cinn_buffer_t* _B = cinn_buffer_t::new_((cinn_device_kind_t)(0)/*target*/, cinn_float32_t(), { 64, 64 }); + cinn_buffer_malloc((void*)(0), _C); + cinn_buffer_malloc((void*)(0), _B); + const float* A = ((const float*)(_A->memory)); + float* B = ((float*)(_B->memory)); + float* C = ((float*)(_C->memory)); + for (int32_t i = 0; i < 32; i += 1) { + for (int32_t ax0 = 0; ax0 < 32; ax0 += 1) { + for (int32_t ax1 = 0; ax1 < 32; ax1 += 1) { + B[((64 * ax0) + ((64 * i) + (ax1 + i)))] = A[((64 * ax0) + ((64 * i) + (ax1 + i)))]; + }; + }; + for (int32_t j = 0; j < 32; j += 1) { + C[((32 * i) + j)] = B[((65 * i) + (65 * j))]; + }; + }; + cinn_buffer_free((void*)(0), _B); + cinn_buffer_free((void*)(0), _C); +} + +)ROC"; + ASSERT_EQ(utils::Trim(target_code), utils::Trim(source_code)); +} + +TEST(IrSchedule, compute_at3) { + Context::Global().ResetNameId(); + Expr M(64); + Expr N(32); + + Target target = common::DefaultHostTarget(); + + Placeholder A("A", {M, M}); + auto B = Compute( + {M, M}, [&](Var i, Var j) { return A(i, j); }, "B"); + auto C = Compute( + {M, M}, [&](Var i, Var j) { return B(i, j); }, "C"); + + auto stages = CreateStages({A, B, C}); + + auto func = cinn::lang::LowerVec("test_compute_at3", stages, {A, C}, {}, {}, nullptr, target, true); + CHECK_EQ(func.size(), 1U); + + auto ast_expr = func[0]->body; + std::vector vec_ast{ast_expr}; + ir::ModuleExpr mod_expr(vec_ast); + ir::IRSchedule ir_sch(mod_expr); + + auto block_b = ir_sch.GetBlock("B"); + + auto fused = ir_sch.Fuse("C", {0, 1}); + auto splited = ir_sch.Split(fused, {32, -1}); + + auto loops = ir_sch.GetLoops("C"); + + ir_sch.ComputeAt(block_b, loops[0]); + + VLOG(1) << "After ComputeAt, IR is : " << ir_sch.GetModule().GetExprs().at(0); + + Module::Builder builder("module1", target); + for (auto& i : func) { + builder.AddFunction(i); + } + auto module = builder.Build(); + CodeGenC codegen(target); + codegen.SetInlineBuiltinCodes(false); + auto source_code = codegen.Compile(module, CodeGenC::OutputKind::CImpl); + + VLOG(1) << "compute_at3 source code is :\n" << source_code; + + std::string target_code = R"ROC( +#include +#include + +void test_compute_at3(void* _args, int32_t num_args) +{ + const cinn_buffer_t* _A = cinn_pod_value_to_buffer_p(&(((cinn_pod_value_t*)(_args))[0])); + cinn_buffer_t* _C = cinn_pod_value_to_buffer_p(&(((cinn_pod_value_t*)(_args))[1])); + cinn_buffer_t* _B = cinn_buffer_t::new_((cinn_device_kind_t)(0)/*target*/, cinn_float32_t(), { 64, 64 }); + cinn_buffer_malloc((void*)(0), _C); + cinn_buffer_malloc((void*)(0), _B); + const float* A = ((const float*)(_A->memory)); + float* B = ((float*)(_B->memory)); + float* C = ((float*)(_C->memory)); + for (int32_t i_j_fused = 0; i_j_fused < 32; i_j_fused += 1) { + for (int32_t ax0 = 0; ax0 < 2; ax0 += 1) { + for (int32_t ax1 = 0; ax1 < 64; ax1 += 1) { + B[((64 * ax0) + ((128 * i_j_fused) + ax1))] = A[((64 * ax0) + ((128 * i_j_fused) + ax1))]; + }; + }; + for (int32_t i_j_fused_0 = 0; i_j_fused_0 < 128; 
i_j_fused_0 += 1) { + C[((128 * i_j_fused) + i_j_fused_0)] = B[((128 * i_j_fused) + i_j_fused_0)]; + }; + }; + cinn_buffer_free((void*)(0), _B); + cinn_buffer_free((void*)(0), _C); +} + +)ROC"; + ASSERT_EQ(utils::Trim(target_code), utils::Trim(source_code)); +} + +#ifdef CINN_WITH_CUDA +TEST(IrSchedule, compute_at4) { + Context::Global().ResetNameId(); + Expr M(32); + Expr N(32); + Expr P(32); + + Target target = common::DefaultNVGPUTarget(); + + Placeholder A("A", {M, N, P}); + auto B = Compute( + {M, N, P}, [&](Var i, Var j, Var k) { return A(i, j, k); }, "B"); + auto C = Compute( + {M, N, P}, [&](Var i, Var j, Var k) { return B(i, j, k); }, "C"); + + auto stages = CreateStages({A, B, C}); + stages[B]->SetBuffer("local"); + + auto func = cinn::lang::LowerVec("test_compute_at4", stages, {A, C}, {}, {}, nullptr, target, true); + CHECK_EQ(func.size(), 1U); + + auto ast_expr = func[0]->body; + std::vector vec_ast{ast_expr}; + ir::ModuleExpr mod_expr(vec_ast); + ir::IRSchedule ir_sch(mod_expr); + + auto block_b = ir_sch.GetBlock("B"); + auto loops = ir_sch.GetLoops("C"); + + ir_sch.ComputeAt(block_b, loops[1]); + + Module::Builder builder("module1", target); + for (auto& i : func) { + builder.AddFunction(i); + } + auto module = builder.Build(); + CodeGenCUDA_Dev codegen(target); + codegen.SetInlineBuiltinCodes(false); + auto source_code = codegen.Compile(module, CodeGenC::OutputKind::CImpl); + + VLOG(1) << "compute_at4 source code is :\n" << source_code; + + std::string target_code = codegen.GetSourceHeader() + R"ROC(__global__ +void test_compute_at4(const float* __restrict__ A, float* __restrict__ C) +{ + float _B_temp_buffer [ 32768 ]; + float* B = _B_temp_buffer; + for (int32_t i = 0; i < 32; i += 1) { + for (int32_t j = 0; j < 32; j += 1) { + for (int32_t ax0 = 0; ax0 < 32; ax0 += 1) { + B[((1024 * i) + ((32 * j) + ax0))] = A[((1024 * i) + ((32 * j) + ax0))]; + }; + for (int32_t k = 0; k < 32; k += 1) { + C[((1024 * i) + ((32 * j) + k))] = B[((1024 * i) + ((32 * j) + k))]; + }; + }; + }; +} + +)ROC"; + ASSERT_EQ(utils::Trim(target_code), utils::Trim(source_code)); +} + +TEST(IrSchedule, compute_at5) { + Context::Global().ResetNameId(); + Expr M(64); + Expr N(32); + + Target target = common::DefaultNVGPUTarget(); + + Placeholder A("A", {M, M}); + auto B = Compute( + {M, M}, [&](Var i, Var j) { return A(i, j); }, "B"); + auto C = Compute( + {N, N}, [&](Var i, Var j) { return B(i + j, i + j); }, "C"); + + auto stages = CreateStages({A, B, C}); + stages[B]->SetBuffer("local"); + + auto func = cinn::lang::LowerVec("test_compute_at5", stages, {A, C}, {}, {}, nullptr, target, true); + CHECK_EQ(func.size(), 1U); + + auto ast_expr = func[0]->body; + std::vector vec_ast{ast_expr}; + ir::ModuleExpr mod_expr(vec_ast); + ir::IRSchedule ir_sch(mod_expr); + + auto block_b = ir_sch.GetBlock("B"); + auto loops = ir_sch.GetLoops("C"); + + ir_sch.ComputeAt(block_b, loops[0]); + + Module::Builder builder("module1", target); + for (auto& i : func) { + builder.AddFunction(i); + } + auto module = builder.Build(); + CodeGenCUDA_Dev codegen(target); + codegen.SetInlineBuiltinCodes(false); + auto source_code = codegen.Compile(module, CodeGenC::OutputKind::CImpl); + + VLOG(1) << "compute_at5 source code is :\n" << source_code; + + std::string target_code = codegen.GetSourceHeader() + + R"ROC(__global__ +void test_compute_at5(const float* __restrict__ A, float* __restrict__ C) +{ + float _B_temp_buffer [ 4096 ]; + float* B = _B_temp_buffer; + for (int32_t i = 0; i < 32; i += 1) { + for (int32_t ax0 = 0; ax0 < 32; 
+      for (int32_t ax1 = 0; ax1 < 32; ax1 += 1) {
+        B[((64 * ax0) + ((64 * i) + (ax1 + i)))] = A[((64 * ax0) + ((64 * i) + (ax1 + i)))];
+      };
+    };
+    for (int32_t j = 0; j < 32; j += 1) {
+      C[((32 * i) + j)] = B[((65 * i) + (65 * j))];
+    };
+  };
+}
+
+)ROC";
+  ASSERT_EQ(utils::Trim(target_code), utils::Trim(source_code));
+}
+
+TEST(IrSchedule, compute_at6) {
+  Context::Global().ResetNameId();
+  Expr M(64);
+  Expr N(32);
+
+  Target target = common::DefaultNVGPUTarget();
+
+  Placeholder<float> A("A", {M, M});
+  auto B = Compute(
+      {M, M}, [&](Var i, Var j) { return A(i, j); }, "B");
+  auto C = Compute(
+      {M, M}, [&](Var i, Var j) { return B(i, j); }, "C");
+
+  auto stages = CreateStages({A, B, C});
+  stages[B]->SetBuffer("local");
+
+  auto func = cinn::lang::LowerVec("test_compute_at6", stages, {A, C}, {}, {}, nullptr, target, true);
+  CHECK_EQ(func.size(), 1U);
+
+  auto ast_expr = func[0]->body;
+  std::vector<Expr> vec_ast{ast_expr};
+  ir::ModuleExpr mod_expr(vec_ast);
+  ir::IRSchedule ir_sch(mod_expr);
+
+  auto block_b = ir_sch.GetBlock("B");
+
+  auto fused = ir_sch.Fuse("C", {0, 1});
+  auto splited = ir_sch.Split(fused, {32, -1});
+
+  auto loops = ir_sch.GetLoops("C");
+
+  ir_sch.ComputeAt(block_b, loops[1]);
+
+  VLOG(1) << "After ComputeAt, IR is : " << ir_sch.GetModule().GetExprs().at(0);
+
+  Module::Builder builder("module1", target);
+  for (auto& i : func) {
+    builder.AddFunction(i);
+  }
+  auto module = builder.Build();
+  CodeGenCUDA_Dev codegen(target);
+  codegen.SetInlineBuiltinCodes(false);
+  auto source_code = codegen.Compile(module, CodeGenC::OutputKind::CImpl);
+
+  VLOG(1) << "compute_at6 source code is :\n" << source_code;
+
+  std::string target_code = codegen.GetSourceHeader() + R"ROC(__global__
+void test_compute_at6(const float* __restrict__ A, float* __restrict__ C)
+{
+  float _B_temp_buffer [ 4096 ];
+  float* B = _B_temp_buffer;
+  for (int32_t i_j_fused = 0; i_j_fused < 32; i_j_fused += 1) {
+    for (int32_t i_j_fused_0 = 0; i_j_fused_0 < 128; i_j_fused_0 += 1) {
+      B[((128 * i_j_fused) + i_j_fused_0)] = A[((128 * i_j_fused) + i_j_fused_0)];
+      C[((128 * i_j_fused) + i_j_fused_0)] = B[((128 * i_j_fused) + i_j_fused_0)];
+    };
+  };
+}
+
+)ROC";
+  ASSERT_EQ(utils::Trim(target_code), utils::Trim(source_code));
+}
+#endif
+
+TEST(IrSchedule, cache_read1) {
+  Context::Global().ResetNameId();
+  Expr M(64);
+  Expr N(32);
+  Expr P(16);
+
+  Target target = common::DefaultHostTarget();
+
+  Placeholder<float> A("A", {M, M});
+  auto B = Compute(
+      {N, N}, [&](Var i, Var j) { return A(i, j) * Expr(2.f); }, "B");
+  auto C = Compute(
+      {P, P}, [&](Var i, Var j) { return B(i, j) + Expr(1.f); }, "C");
+
+  auto stages = CreateStages({A, B, C});
+
+  auto func = cinn::lang::LowerVec("test_cache_read1", stages, {A, C}, {}, {}, nullptr, target, true);
+
+  CHECK_EQ(func.size(), 1U);
+
+  auto ast_expr = func[0]->body;
+  std::vector<Expr> vec_ast{ast_expr};
+  ir::ModuleExpr mod_expr(vec_ast);
+  ir::IRSchedule ir_sch(mod_expr);
+
+  auto block_b = ir_sch.GetBlock("B");
+  auto a_cache = ir_sch.CacheRead(block_b, 0, "local");
+  auto block_c = ir_sch.GetBlock("C");
+  auto b_cache = ir_sch.CacheRead(block_c, 0, "local");
+
+  VLOG(1) << "After CacheRead, IR is : " << ir_sch.GetModule().GetExprs().at(0);
+
+  Module::Builder builder("module1", target);
+  for (auto& i : func) {
+    builder.AddFunction(i);
+  }
+  auto module = builder.Build();
+  CodeGenC codegen(target);
+  codegen.SetInlineBuiltinCodes(false);
+  auto source_code = codegen.Compile(module, CodeGenC::OutputKind::CImpl);
+
+  VLOG(1) << "cache_read1 source code is :\n" << source_code;
+
+  std::string target_code = R"ROC(
+#include <cinn_runtime.h>
+#include <stdio.h>
+
+void test_cache_read1(void* _args, int32_t num_args)
+{
+  const cinn_buffer_t* _A = cinn_pod_value_to_buffer_p(&(((cinn_pod_value_t*)(_args))[0]));
+  cinn_buffer_t* _C = cinn_pod_value_to_buffer_p(&(((cinn_pod_value_t*)(_args))[1]));
+  cinn_buffer_t* _B = cinn_buffer_t::new_((cinn_device_kind_t)(0)/*target*/, cinn_float32_t(), { 32, 32 });
+  cinn_buffer_malloc((void*)(0), _C);
+  cinn_buffer_malloc((void*)(0), _B);
+  const float* A = ((const float*)(_A->memory));
+  float* B = ((float*)(_B->memory));
+  float* C = ((float*)(_C->memory));
+  for (int32_t cache_ax0 = 0; cache_ax0 < 32; cache_ax0 += 1) {
+    for (int32_t cache_ax1 = 0; cache_ax1 < 32; cache_ax1 += 1) {
+      A_local_temp_buffer[((64 * cache_ax0) + cache_ax1)] = A[((64 * cache_ax0) + cache_ax1)];
+    };
+  };
+  for (int32_t i = 0; i < 32; i += 1) {
+    for (int32_t j = 0; j < 32; j += 1) {
+      B[((32 * i) + j)] = (2.00000000f * A_local_temp_buffer[((64 * i) + j)]);
+    };
+  };
+  for (int32_t cache_ax0_0 = 0; cache_ax0_0 < 16; cache_ax0_0 += 1) {
+    for (int32_t cache_ax1_0 = 0; cache_ax1_0 < 16; cache_ax1_0 += 1) {
+      B_local_temp_buffer[((32 * cache_ax0_0) + cache_ax1_0)] = B[((32 * cache_ax0_0) + cache_ax1_0)];
+    };
+  };
+  for (int32_t i = 0; i < 16; i += 1) {
+    for (int32_t j = 0; j < 16; j += 1) {
+      C[((16 * i) + j)] = (1.00000000f + B_local_temp_buffer[((32 * i) + j)]);
+    };
+  };
+  cinn_buffer_free((void*)(0), _B);
+  cinn_buffer_free((void*)(0), _C);
+}
+
+)ROC";
+  ASSERT_EQ(utils::Trim(target_code), utils::Trim(source_code));
+}
+
+TEST(IrSchedule, cache_read2) {
+  Context::Global().ResetNameId();
+  Expr M(64);
+  Expr N(32);
+
+  Target target = common::DefaultHostTarget();
+
+  Placeholder<float> A("A", {M, N});
+  auto B = Compute(
+      {M, N}, [&](Var i, Var j) { return A(i, j) * Expr(2.f); }, "B");
+
+  auto stages = CreateStages({A, B});
+
+  auto func = cinn::lang::LowerVec("test_cache_read2", stages, {A, B}, {}, {}, nullptr, target, true);
+
+  CHECK_EQ(func.size(), 1U);
+
+  auto ast_expr = func[0]->body;
+  std::vector<Expr> vec_ast{ast_expr};
+  ir::ModuleExpr mod_expr(vec_ast);
+  ir::IRSchedule ir_sch(mod_expr);
+
+  auto block_b = ir_sch.GetBlock("B");
+
+  auto a_cache = ir_sch.CacheRead(block_b, 0, "local");
+
+  auto loops = ir_sch.GetLoops("B");
+  ir_sch.ComputeAt(a_cache, loops[1]);
+
+  VLOG(1) << "After CacheRead and ComputeAt, IR is : " << ir_sch.GetModule().GetExprs().at(0);
+
+  Module::Builder builder("module1", target);
+  for (auto& i : func) {
+    builder.AddFunction(i);
+  }
+  auto module = builder.Build();
+  CodeGenC codegen(target);
+  codegen.SetInlineBuiltinCodes(false);
+  auto source_code = codegen.Compile(module, CodeGenC::OutputKind::CImpl);
+
+  VLOG(1) << "cache_read2 source code is :\n" << source_code;
+
+  std::string target_code = R"ROC(
+#include <cinn_runtime.h>
+#include <stdio.h>
+
+void test_cache_read2(void* _args, int32_t num_args)
+{
+  const cinn_buffer_t* _A = cinn_pod_value_to_buffer_p(&(((cinn_pod_value_t*)(_args))[0]));
+  cinn_buffer_t* _B = cinn_pod_value_to_buffer_p(&(((cinn_pod_value_t*)(_args))[1]));
+  cinn_buffer_malloc((void*)(0), _B);
+  const float* A = ((const float*)(_A->memory));
+  float* B = ((float*)(_B->memory));
+  for (int32_t i = 0; i < 64; i += 1) {
+    for (int32_t j = 0; j < 32; j += 1) {
+      A_local_temp_buffer[((32 * i) + j)] = A[((32 * i) + j)];
+      B[((32 * i) + j)] = (2.00000000f * A_local_temp_buffer[((32 * i) + j)]);
+    };
+  };
+  cinn_buffer_free((void*)(0), _B);
+}
+
+)ROC";
+  ASSERT_EQ(utils::Trim(target_code), utils::Trim(source_code));
+}
+
+TEST(IrSchedule, cache_write1) {
+  Context::Global().ResetNameId();
+  Expr M(64);
+  Expr N(32);
+
+  Target target = common::DefaultHostTarget();
+
+  Placeholder<float> A("A", {M, N});
+  auto B = Compute(
+      {M, N}, [&](Var i, Var j) { return A(i, j) * Expr(2.f); }, "B");
+  auto C = Compute(
+      {M, N}, [&](Var i, Var j) { return B(i, j) + Expr(1.f); }, "C");
+
+  auto stages = CreateStages({A, B, C});
+
+  auto func = cinn::lang::LowerVec("test_cache_write1", stages, {A, C}, {}, {}, nullptr, target, true);
+
+  CHECK_EQ(func.size(), 1U);
+
+  auto ast_expr = func[0]->body;
+  std::vector<Expr> vec_ast{ast_expr};
+  ir::ModuleExpr mod_expr(vec_ast);
+  ir::IRSchedule ir_sch(mod_expr);
+
+  auto block_b = ir_sch.GetBlock("B");
+  auto b_cache = ir_sch.CacheWrite(block_b, 0, "local");
+  auto block_c = ir_sch.GetBlock("C");
+  auto c_cache = ir_sch.CacheWrite(block_c, 0, "local");
+
+  VLOG(1) << "After CacheWrite, IR is : " << ir_sch.GetModule().GetExprs().at(0);
+
+  Module::Builder builder("module1", target);
+  for (auto& i : func) {
+    builder.AddFunction(i);
+  }
+  auto module = builder.Build();
+  CodeGenC codegen(target);
+  codegen.SetInlineBuiltinCodes(false);
+  auto source_code = codegen.Compile(module, CodeGenC::OutputKind::CImpl);
+
+  VLOG(1) << "cache_write1 source code is :\n" << source_code;
+
+  std::string target_code = R"ROC(
+#include <cinn_runtime.h>
+#include <stdio.h>
+
+void test_cache_write1(void* _args, int32_t num_args)
+{
+  const cinn_buffer_t* _A = cinn_pod_value_to_buffer_p(&(((cinn_pod_value_t*)(_args))[0]));
+  cinn_buffer_t* _C = cinn_pod_value_to_buffer_p(&(((cinn_pod_value_t*)(_args))[1]));
+  cinn_buffer_t* _B = cinn_buffer_t::new_((cinn_device_kind_t)(0)/*target*/, cinn_float32_t(), { 64, 32 });
+  cinn_buffer_malloc((void*)(0), _C);
+  cinn_buffer_malloc((void*)(0), _B);
+  const float* A = ((const float*)(_A->memory));
+  float* B = ((float*)(_B->memory));
+  float* C = ((float*)(_C->memory));
+  for (int32_t i = 0; i < 64; i += 1) {
+    for (int32_t j = 0; j < 32; j += 1) {
+      B_local_temp_buffer[((32 * i) + j)] = (2.00000000f * A[((32 * i) + j)]);
+    };
+  };
+  for (int32_t cache_ax0 = 0; cache_ax0 < 64; cache_ax0 += 1) {
+    for (int32_t cache_ax1 = 0; cache_ax1 < 32; cache_ax1 += 1) {
+      B[((32 * cache_ax0) + cache_ax1)] = B_local_temp_buffer[((32 * cache_ax0) + cache_ax1)];
+    };
+  };
+  for (int32_t i = 0; i < 64; i += 1) {
+    for (int32_t j = 0; j < 32; j += 1) {
+      C_local_temp_buffer[((32 * i) + j)] = (1.00000000f + B[((32 * i) + j)]);
+    };
+  };
+  for (int32_t cache_ax0_0 = 0; cache_ax0_0 < 64; cache_ax0_0 += 1) {
+    for (int32_t cache_ax1_0 = 0; cache_ax1_0 < 32; cache_ax1_0 += 1) {
+      C[((32 * cache_ax0_0) + cache_ax1_0)] = C_local_temp_buffer[((32 * cache_ax0_0) + cache_ax1_0)];
+    };
+  };
+  cinn_buffer_free((void*)(0), _B);
+  cinn_buffer_free((void*)(0), _C);
+}
+
+)ROC";
+  ASSERT_EQ(utils::Trim(target_code), utils::Trim(source_code));
+}
+
+TEST(IrSchedule, cache_write2) {
+  Context::Global().ResetNameId();
+  Expr M(64);
+  Expr N(32);
+
+  Target target = common::DefaultHostTarget();
+
+  Placeholder<float> A("A", {M, N});
+  auto B = Compute(
+      {M, N}, [&](Var i, Var j) { return A(i, j) * Expr(2.f); }, "B");
+
+  auto stages = CreateStages({A, B});
+
+  auto func = cinn::lang::LowerVec("test_cache_write2", stages, {A, B}, {}, {}, nullptr, target, true);
+
+  CHECK_EQ(func.size(), 1U);
+
+  auto ast_expr = func[0]->body;
+  std::vector<Expr> vec_ast{ast_expr};
+  ir::ModuleExpr mod_expr(vec_ast);
+  ir::IRSchedule ir_sch(mod_expr);
+
+  auto block_b = ir_sch.GetBlock("B");
+  auto b_cache = ir_sch.CacheWrite(block_b, 0, "local");
+  auto loops = ir_sch.GetLoops("B");
+  ir_sch.ComputeAt(b_cache, loops[1]);
+
+  VLOG(1) << "After CacheWrite and ComputeAt, IR is : " << ir_sch.GetModule().GetExprs().at(0);
+
+  Module::Builder builder("module1", target);
+  for (auto& i : func) {
+    builder.AddFunction(i);
+  }
+  auto module = builder.Build();
+  CodeGenC codegen(target);
+  codegen.SetInlineBuiltinCodes(false);
+  auto source_code = codegen.Compile(module, CodeGenC::OutputKind::CImpl);
+
+  VLOG(1) << "cache_write2 source code is :\n" << source_code;
+
+  std::string target_code = R"ROC(
+#include <cinn_runtime.h>
+#include <stdio.h>
+
+void test_cache_write2(void* _args, int32_t num_args)
+{
+  const cinn_buffer_t* _A = cinn_pod_value_to_buffer_p(&(((cinn_pod_value_t*)(_args))[0]));
+  cinn_buffer_t* _B = cinn_pod_value_to_buffer_p(&(((cinn_pod_value_t*)(_args))[1]));
+  cinn_buffer_malloc((void*)(0), _B);
+  const float* A = ((const float*)(_A->memory));
+  float* B = ((float*)(_B->memory));
+  for (int32_t cache_ax0 = 0; cache_ax0 < 64; cache_ax0 += 1) {
+    for (int32_t cache_ax1 = 0; cache_ax1 < 32; cache_ax1 += 1) {
+      B_local_temp_buffer[((32 * cache_ax0) + cache_ax1)] = (2.00000000f * A[((32 * cache_ax0) + cache_ax1)]);
+      B[((32 * cache_ax0) + cache_ax1)] = B_local_temp_buffer[((32 * cache_ax0) + cache_ax1)];
+    };
+  };
+  cinn_buffer_free((void*)(0), _B);
+}
+
+)ROC";
+  ASSERT_EQ(utils::Trim(target_code), utils::Trim(source_code));
+}
+
+#ifdef CINN_WITH_CUDA
+TEST(IrSchedule, cache_read3) {
+  Context::Global().ResetNameId();
+  Expr M(64);
+  Expr N(32);
+  Expr P(16);
+
+  Target target = common::DefaultNVGPUTarget();
+
+  Placeholder<float> A("A", {M, M});
+  auto B = Compute(
+      {N, N}, [&](Var i, Var j) { return A(i, j) * Expr(2.f); }, "B");
+  auto C = Compute(
+      {P, P}, [&](Var i, Var j) { return B(i, j) + Expr(1.f); }, "C");
+
+  auto stages = CreateStages({A, B, C});
+  stages[B]->SetBuffer("local");
+
+  auto func = cinn::lang::LowerVec("test_cache_read3", stages, {A, C}, {}, {}, nullptr, target, true);
+
+  CHECK_EQ(func.size(), 1U);
+
+  auto ast_expr = func[0]->body;
+  std::vector<Expr> vec_ast{ast_expr};
+  ir::ModuleExpr mod_expr(vec_ast);
+  ir::IRSchedule ir_sch(mod_expr);
+
+  auto block_b = ir_sch.GetBlock("B");
+  auto a_cache = ir_sch.CacheRead(block_b, 0, "local");
+  auto block_c = ir_sch.GetBlock("C");
+  auto b_cache = ir_sch.CacheRead(block_c, 0, "local");
+  auto loops_c = ir_sch.GetLoops("C");
+  ir_sch.SyncThreads(loops_c[1], false);
+  auto loops_b = ir_sch.GetLoops("B");
+  ir_sch.SyncThreads(loops_b[1]);
+
+  VLOG(1) << "After CacheRead, IR is : " << ir_sch.GetModule().GetExprs().at(0);
+
+  Module::Builder builder("module1", target);
+  for (auto& i : func) {
+    builder.AddFunction(i);
+  }
+  auto module = builder.Build();
+  CodeGenCUDA_Dev codegen(target);
+  codegen.SetInlineBuiltinCodes(false);
+  auto source_code = codegen.Compile(module, CodeGenC::OutputKind::CImpl);
+
+  VLOG(1) << "cache_read3 source code is :\n" << source_code;
+
+  std::string target_code = codegen.GetSourceHeader() + R"ROC(__global__
+void test_cache_read3(const float* __restrict__ A, float* __restrict__ C)
+{
+  float _B_temp_buffer [ 1024 ];
+  float* B = _B_temp_buffer;
+  for (int32_t cache_ax0 = 0; cache_ax0 < 32; cache_ax0 += 1) {
+    for (int32_t cache_ax1 = 0; cache_ax1 < 32; cache_ax1 += 1) {
+      A_local_temp_buffer[((64 * cache_ax0) + cache_ax1)] = A[((64 * cache_ax0) + cache_ax1)];
+    };
+  };
+  for (int32_t i = 0; i < 32; i += 1) {
+    for (int32_t j = 0; j < 32; j += 1) {
+      B[((32 * i) + j)] = (2.00000000f * A_local_temp_buffer[((64 * i) + j)]);
+    };
+    __syncthreads();
+  };
+  for (int32_t cache_ax0_0 = 0; cache_ax0_0 < 16; cache_ax0_0 += 1) {
+    for (int32_t cache_ax1_0 = 0; cache_ax1_0 < 16; cache_ax1_0 += 1) {
+      B_local_temp_buffer[((32 * cache_ax0_0) + cache_ax1_0)] = B[((32 * cache_ax0_0) + cache_ax1_0)];
+    };
+  };
+  for (int32_t i = 0; i < 16; i += 1) {
+    __syncthreads();
+    for (int32_t j = 0; j < 16; j += 1) {
+      C[((16 * i) + j)] = (1.00000000f + B_local_temp_buffer[((32 * i) + j)]);
+    };
+  };
+}
+
+)ROC";
+  ASSERT_EQ(utils::Trim(target_code), utils::Trim(source_code));
+}
+
+TEST(IrSchedule, cache_write3) {
+  Context::Global().ResetNameId();
+  Expr M(64);
+  Expr N(32);
+
+  Target target = common::DefaultNVGPUTarget();
+
+  Placeholder<float> A("A", {M, N});
+  auto B = Compute(
+      {M, N}, [&](Var i, Var j) { return A(i, j) * Expr(2.f); }, "B");
+  auto C = Compute(
+      {M, N}, [&](Var i, Var j) { return B(i, j) + Expr(1.f); }, "C");
+
+  auto stages = CreateStages({A, B, C});
+  stages[B]->SetBuffer("shared");
+
+  auto func = cinn::lang::LowerVec("test_cache_write3", stages, {A, C}, {}, {}, nullptr, target, true);
+
+  CHECK_EQ(func.size(), 1U);
+
+  auto ast_expr = func[0]->body;
+  std::vector<Expr> vec_ast{ast_expr};
+  ir::ModuleExpr mod_expr(vec_ast);
+  ir::IRSchedule ir_sch(mod_expr);
+
+  auto block_b = ir_sch.GetBlock("B");
+  auto b_cache = ir_sch.CacheWrite(block_b, 0, "local");
+  auto block_c = ir_sch.GetBlock("C");
+  auto c_cache = ir_sch.CacheWrite(block_c, 0, "local");
+  auto loops_c = ir_sch.GetLoops("C");
+  ir_sch.SyncThreads(loops_c[0], false);
+  auto loops_b = ir_sch.GetLoops("B");
+  ir_sch.SyncThreads(loops_b[0]);
+
+  VLOG(1) << "After CacheWrite, IR is : " << ir_sch.GetModule().GetExprs().at(0);
+
+  Module::Builder builder("module1", target);
+  for (auto& i : func) {
+    builder.AddFunction(i);
+  }
+  auto module = builder.Build();
+  CodeGenCUDA_Dev codegen(target);
+  codegen.SetInlineBuiltinCodes(false);
+  auto source_code = codegen.Compile(module, CodeGenC::OutputKind::CImpl);
+
+  VLOG(1) << "cache_write3 source code is :\n" << source_code;
+
+  std::string target_code = codegen.GetSourceHeader() + R"ROC(__global__
+void test_cache_write3(const float* __restrict__ A, float* __restrict__ C)
+{
+  __shared__ float _B_temp_buffer [ 2048 ];
+  float* B = _B_temp_buffer;
+  for (int32_t i = 0; i < 64; i += 1) {
+    for (int32_t j = 0; j < 32; j += 1) {
+      B_local_temp_buffer[((32 * i) + j)] = (2.00000000f * A[((32 * i) + j)]);
+    };
+  };
+  for (int32_t cache_ax0 = 0; cache_ax0 < 64; cache_ax0 += 1) {
+    for (int32_t cache_ax1 = 0; cache_ax1 < 32; cache_ax1 += 1) {
+      B[((32 * cache_ax0) + cache_ax1)] = B_local_temp_buffer[((32 * cache_ax0) + cache_ax1)];
+    };
+  };
+  __syncthreads();
+  for (int32_t i = 0; i < 64; i += 1) {
+    for (int32_t j = 0; j < 32; j += 1) {
+      C_local_temp_buffer[((32 * i) + j)] = (1.00000000f + B[((32 * i) + j)]);
+    };
+  };
+  __syncthreads();
+  for (int32_t cache_ax0_0 = 0; cache_ax0_0 < 64; cache_ax0_0 += 1) {
+    for (int32_t cache_ax1_0 = 0; cache_ax1_0 < 32; cache_ax1_0 += 1) {
+      C[((32 * cache_ax0_0) + cache_ax1_0)] = C_local_temp_buffer[((32 * cache_ax0_0) + cache_ax1_0)];
+    };
+  };
+}
+
+)ROC";
+  ASSERT_EQ(utils::Trim(target_code), utils::Trim(source_code));
+}
+
+TEST(IrSchedule, sync_threads) {
+  Context::Global().ResetNameId();
+  Expr M(64);
+  Expr N(32);
+
+  Target target = common::DefaultNVGPUTarget();
+
+  Placeholder<float> A("A", {M, N});
+  auto B = Compute(
+      {M, N}, [&](Var i, Var j) { return A(i, j) * Expr(2.f); }, "B");
+  auto C = Compute(
+      {M, N}, [&](Var i, Var j) { return B(i, j) + Expr(1.f); }, "C");
+
+  auto stages = CreateStages({A, B, C});
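+  // NOTE: SetBuffer("shared") below places B in CUDA shared memory; that is
+  // why the expected kernel in target_code declares
+  // `__shared__ float _B_temp_buffer [ 2048 ]` (64 * 32 floats).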
+  stages[B]->SetBuffer("shared");
+
+  auto func = cinn::lang::LowerVec("test_sync_threads", stages, {A, C}, {}, {}, nullptr, target, true);
+
+  CHECK_EQ(func.size(), 1U);
+
+  auto ast_expr = func[0]->body;
+  std::vector<Expr> vec_ast{ast_expr};
+  ir::ModuleExpr mod_expr(vec_ast);
+  ir::IRSchedule ir_sch(mod_expr);
+
+  auto block_b = ir_sch.GetBlock("B");
+  auto b_cache = ir_sch.CacheWrite(block_b, 0, "local");
+  auto block_c = ir_sch.GetBlock("C");
+  auto c_cache = ir_sch.CacheWrite(block_c, 0, "local");
+  block_c = ir_sch.GetBlock("C");
+  ir_sch.SyncThreads(block_c, false);
+  block_b = ir_sch.GetBlock("B");
+  ir_sch.SyncThreads(block_b);
+
+  VLOG(1) << "After CacheWrite and SyncThreads, IR is : " << ir_sch.GetModule().GetExprs().at(0);
+
+  Module::Builder builder("module1", target);
+  for (auto& i : func) {
+    builder.AddFunction(i);
+  }
+  auto module = builder.Build();
+  CodeGenCUDA_Dev codegen(target);
+  codegen.SetInlineBuiltinCodes(false);
+  auto source_code = codegen.Compile(module, CodeGenC::OutputKind::CImpl);
+
+  std::string target_code = codegen.GetSourceHeader() + R"ROC(__global__
+void test_sync_threads(const float* __restrict__ A, float* __restrict__ C)
+{
+  __shared__ float _B_temp_buffer [ 2048 ];
+  float* B = _B_temp_buffer;
+  for (int32_t i = 0; i < 64; i += 1) {
+    for (int32_t j = 0; j < 32; j += 1) {
+      B_local_temp_buffer[((32 * i) + j)] = (2.00000000f * A[((32 * i) + j)]);
+    };
+  };
+  for (int32_t cache_ax0 = 0; cache_ax0 < 64; cache_ax0 += 1) {
+    for (int32_t cache_ax1 = 0; cache_ax1 < 32; cache_ax1 += 1) {
+      B[((32 * cache_ax0) + cache_ax1)] = B_local_temp_buffer[((32 * cache_ax0) + cache_ax1)];
+      __syncthreads();
+    };
+  };
+  for (int32_t i = 0; i < 64; i += 1) {
+    for (int32_t j = 0; j < 32; j += 1) {
+      C_local_temp_buffer[((32 * i) + j)] = (1.00000000f + B[((32 * i) + j)]);
+    };
+  };
+  for (int32_t cache_ax0_0 = 0; cache_ax0_0 < 64; cache_ax0_0 += 1) {
+    for (int32_t cache_ax1_0 = 0; cache_ax1_0 < 32; cache_ax1_0 += 1) {
+      __syncthreads();
+      C[((32 * cache_ax0_0) + cache_ax1_0)] = C_local_temp_buffer[((32 * cache_ax0_0) + cache_ax1_0)];
+    };
+  };
+}
+
+)ROC";
+  ASSERT_EQ(utils::Trim(target_code), utils::Trim(source_code));
+}
+#endif
+
+TEST(IrSchedule, cache_write4) {
+  Context::Global().ResetNameId();
+  Expr M(64);
+  Expr N(32);
+
+  Target target = common::DefaultHostTarget();
+
+  Placeholder<float> A("A", {M, N, N});
+  Var k(32, "k0");
+  auto B = Compute(
+      {M, N}, [&](Var i, Var j) { return lang::ReduceSum(A(i, j, k), {k}); }, "B");
+
+  auto stages = CreateStages({A, B});
+
+  auto func = cinn::lang::LowerVec("test_cache_write4", stages, {A, B}, {}, {}, nullptr, target, true);
+
+  CHECK_EQ(func.size(), 1U);
+
+  auto ast_expr = func[0]->body;
+  std::vector<Expr> vec_ast{ast_expr};
+  ir::ModuleExpr mod_expr(vec_ast);
+  ir::IRSchedule ir_sch(mod_expr);
+
+  auto block_b = ir_sch.GetBlock("B");
+  auto b_cache = ir_sch.CacheWrite(block_b, 0, "local");
+  auto loops = ir_sch.GetLoops("B");
+
+  VLOG(1) << "After CacheWrite, IR is : " << ir_sch.GetModule().GetExprs().at(0);
+
+  Module::Builder builder("module1", target);
+  for (auto& i : func) {
+    builder.AddFunction(i);
+  }
+  auto module = builder.Build();
+  CodeGenC codegen(target);
+  codegen.SetInlineBuiltinCodes(false);
+  auto source_code = codegen.Compile(module, CodeGenC::OutputKind::CImpl);
+
+  VLOG(1) << "cache_write4 source code is :\n" << source_code;
+
+  std::string target_code = R"ROC(
+#include <cinn_runtime.h>
+#include <stdio.h>
+
+void test_cache_write4(void* _args, int32_t num_args)
+{
+  const cinn_buffer_t* _A = cinn_pod_value_to_buffer_p(&(((cinn_pod_value_t*)(_args))[0]));
+  cinn_buffer_t* _B = cinn_pod_value_to_buffer_p(&(((cinn_pod_value_t*)(_args))[1]));
+  cinn_buffer_malloc((void*)(0), _B);
+  const float* A = ((const float*)(_A->memory));
+  float* B = ((float*)(_B->memory));
+  float* B__reduce_init = ((float*)(_B->memory));
+  for (int32_t i = 0; i < 64; i += 1) {
+    for (int32_t j = 0; j < 32; j += 1) {
+      B__reduce_init[((32 * i) + j)] = 0.00000000f;
+      for (int32_t k0 = 0; k0 < 32; k0 += 1) {
+        B_local_temp_buffer[((32 * i) + j)] = (B_local_temp_buffer[((32 * i) + j)] + A[((1024 * i) + ((32 * j) + k0))]);
+      };
+    };
+  };
+  for (int32_t cache_ax0 = 0; cache_ax0 < 64; cache_ax0 += 1) {
+    for (int32_t cache_ax1 = 0; cache_ax1 < 32; cache_ax1 += 1) {
+      B[((32 * cache_ax0) + cache_ax1)] = B_local_temp_buffer[((32 * cache_ax0) + cache_ax1)];
+    };
+  };
+  cinn_buffer_free((void*)(0), _B);
+}
+)ROC";
+  ASSERT_EQ(utils::Trim(target_code), utils::Trim(source_code));
+}
+
+TEST(IrSchedule, rfactor) {
+  Context::Global().ResetNameId();
+  Expr M(32);
+  Expr N(2);
+  Expr K(16);
+
+  Target target = common::DefaultHostTarget();
+
+  Placeholder<float> A("A", {M, N, K});
+  Var j(2, "j0");
+  Var k(16, "k0");
+  auto B = Compute(
+      {M},
+      [&](Var i) {
+        return lang::ReduceSum(A(i, j, k), {j, k});
+      },
+      "B");
+
+  auto stages = CreateStages({A, B});
+  auto func = cinn::lang::LowerVec("test_rfactor", stages, {A, B}, {}, {}, nullptr, target, true);
+  CHECK(!func.empty());
+  auto ast_expr = func[0]->body;
+  std::vector<Expr> vec_ast{ast_expr};
+  ir::ModuleExpr mod_expr(vec_ast);
+  ir::IRSchedule ir_sch(mod_expr);
+  auto loops = ir_sch.GetLoops("B");
+  CHECK_EQ(loops.size(), 3U);
+  auto new_rf_tensor = ir_sch.Rfactor(loops[2], 0);
+  auto* new_rf_tensor_ref = new_rf_tensor.As<ir::_Tensor_>();
+  CHECK(new_rf_tensor_ref);
+  CHECK(new_rf_tensor_ref->buffer.defined());
+  func[0]->temp_bufs.push_back(new_rf_tensor_ref->buffer);
+  func[0]->PrepareBufferCastExprs();
+  std::string origin = utils::GetStreamCnt(func[0]);
+  EXPECT_EQ(origin, utils::Trim(R"ROC(
+function test_rfactor (_A, _B)
+{
+  ScheduleBlock(root)
+  {
+    {
+      serial for (rf_k0, 0, 16)
+      {
+        serial for (i, 0, 32)
+        {
+          ScheduleBlock(rf_B__reduce_init)
+          {
+            i0, i1_0 = axis.bind(i, rf_k0)
+            rf_B__reduce_init[i1_0, i0] = 0.00000000f
+          }
+          serial for (j0, 0, 2)
+          {
+            ScheduleBlock(rf_B)
+            {
+              i0_0, i1, i2 = axis.bind(i, j0, rf_k0)
+              rf_B[i2, i0_0] = (rf_B[i2, i0_0] + A[i0_0, i1, i2])
+            }
+          }
+        }
+      }
+      serial for (i, 0, 32)
+      {
+        ScheduleBlock(B__reduce_init)
+        {
+          i0 = axis.bind(i)
+          B__reduce_init[i0] = 0.00000000f
+        }
+        serial for (k0, 0, 16)
+        {
+          ScheduleBlock(B)
+          {
+            i0_0, i2 = axis.bind(i, k0)
+            B[i0_0] = (B[i0_0] + rf_B[i2, i0_0])
+          }
+        }
+      }
+    }
+  }
+}
+)ROC"));
+  // optimize pass: add temp buffers
+  Module::Builder builder("module1", target);
+  for (auto& i : func) {
+    builder.AddFunction(i);
+  }
+  auto module = builder.Build();
+  CodeGenC codegen(target);
+  codegen.SetInlineBuiltinCodes(false);
+  auto source_code = codegen.Compile(module, CodeGenC::OutputKind::CImpl);
+
+  std::string target_code = R"ROC(
+#include <cinn_runtime.h>
+#include <stdio.h>
+
+void test_rfactor(void* _args, int32_t num_args)
+{
+  const cinn_buffer_t* _A = cinn_pod_value_to_buffer_p(&(((cinn_pod_value_t*)(_args))[0]));
+  cinn_buffer_t* _B = cinn_pod_value_to_buffer_p(&(((cinn_pod_value_t*)(_args))[1]));
+  cinn_buffer_t* rf__B = cinn_buffer_t::new_((cinn_device_kind_t)(0)/*target*/, cinn_float32_t(), { 16, 32 });
+  cinn_buffer_malloc((void*)(0), _B);
+  cinn_buffer_malloc((void*)(0), rf__B);
+  const float* A = ((const float*)(_A->memory));
+  float* B = ((float*)(_B->memory));
+  float* B__reduce_init = ((float*)(_B->memory));
+  float* rf_B = ((float*)(rf__B->memory));
+  float* rf_B__reduce_init = ((float*)(rf__B->memory));
+  for (int32_t rf_k0 = 0; rf_k0 < 16; rf_k0 += 1) {
+    for (int32_t i = 0; i < 32; i += 1) {
+      rf_B__reduce_init[((32 * rf_k0) + i)] = 0.00000000f;
+      for (int32_t j0 = 0; j0 < 2; j0 += 1) {
+        rf_B[((32 * rf_k0) + i)] = (rf_B[((32 * rf_k0) + i)] + A[((32 * i) + ((16 * j0) + rf_k0))]);
+      };
+    };
+  };
+  for (int32_t i = 0; i < 32; i += 1) {
+    B__reduce_init[i] = 0.00000000f;
+    for (int32_t k0 = 0; k0 < 16; k0 += 1) {
+      B[i] = (B[i] + rf_B[((32 * k0) + i)]);
+    };
+  };
+  cinn_buffer_free((void*)(0), rf__B);
+  cinn_buffer_free((void*)(0), _B);
+}
+)ROC";
+  ASSERT_EQ(utils::Trim(target_code), utils::Trim(source_code));
+}
+
+TEST(IrSchedule, rfactor1) {
+  Context::Global().ResetNameId();
+  Expr M(32);
+  Expr N(2);
+  Expr K(16);
+
+  Target target = common::DefaultHostTarget();
+
+  Placeholder<float> A("A", {M, N, K});
+  Var j(2, "j0");
+  Var k(16, "k0");
+  auto B = Compute(
+      {M},
+      [&](Var i) {
+        return lang::ReduceSum(A(i, j, k), {j, k});
+      },
+      "B");
+
+  auto stages = CreateStages({A, B});
+  auto func = cinn::lang::LowerVec("test_rfactor", stages, {A, B}, {}, {}, nullptr, target, true);
+  CHECK(!func.empty());
+  auto ast_expr = func[0]->body;
+  std::vector<Expr> vec_ast{ast_expr};
+  ir::ModuleExpr mod_expr(vec_ast);
+  ir::IRSchedule ir_sch(mod_expr);
+  auto loops = ir_sch.GetLoops("B");
+  CHECK_EQ(loops.size(), 3U);
+  auto new_rf_tensor = ir_sch.Rfactor(loops[1], 1);
+  auto* new_rf_tensor_ref = new_rf_tensor.As<ir::_Tensor_>();
+  CHECK(new_rf_tensor_ref);
+  CHECK(new_rf_tensor_ref->buffer.defined());
+  func[0]->temp_bufs.push_back(new_rf_tensor_ref->buffer);
+  func[0]->PrepareBufferCastExprs();
+  std::string origin = utils::GetStreamCnt(func[0]);
+
+  EXPECT_EQ(origin, utils::Trim(R"ROC(
+function test_rfactor (_A, _B)
+{
+  ScheduleBlock(root)
+  {
+    {
+      serial for (i, 0, 32)
+      {
+        serial for (rf_j0, 0, 2)
+        {
+          ScheduleBlock(rf_B__reduce_init)
+          {
+            i0, i1_0 = axis.bind(i, rf_j0)
+            rf_B__reduce_init[i0, i1_0] = 0.00000000f
+          }
+          serial for (k0, 0, 16)
+          {
+            ScheduleBlock(rf_B)
+            {
+              i0_0, i1, i2 = axis.bind(i, rf_j0, k0)
+              rf_B[i0_0, i1] = (rf_B[i0_0, i1] + A[i0_0, i1, i2])
+            }
+          }
+        }
+      }
+      serial for (i, 0, 32)
+      {
+        ScheduleBlock(B__reduce_init)
+        {
+          i0 = axis.bind(i)
+          B__reduce_init[i0] = 0.00000000f
+        }
+        serial for (j0, 0, 2)
+        {
+          ScheduleBlock(B)
+          {
+            i0_0, i1 = axis.bind(i, j0)
+            B[i0_0] = (B[i0_0] + rf_B[i0_0, i1])
+          }
+        }
+      }
+    }
+  }
+}
+)ROC"));
+  // optimize pass: add temp buffers
+  Module::Builder builder("module1", target);
+  for (auto& i : func) {
+    builder.AddFunction(i);
+  }
+  auto module = builder.Build();
+  CodeGenC codegen(target);
+  codegen.SetInlineBuiltinCodes(false);
+  auto source_code = codegen.Compile(module, CodeGenC::OutputKind::CImpl);
+
+  std::string target_code = R"ROC(
+#include <cinn_runtime.h>
+#include <stdio.h>
+
+void test_rfactor(void* _args, int32_t num_args)
+{
+  const cinn_buffer_t* _A = cinn_pod_value_to_buffer_p(&(((cinn_pod_value_t*)(_args))[0]));
+  cinn_buffer_t* _B = cinn_pod_value_to_buffer_p(&(((cinn_pod_value_t*)(_args))[1]));
+  cinn_buffer_t* rf__B = cinn_buffer_t::new_((cinn_device_kind_t)(0)/*target*/, cinn_float32_t(), { 32, 2 });
+  cinn_buffer_malloc((void*)(0), _B);
+  cinn_buffer_malloc((void*)(0), rf__B);
+  const float* A = ((const float*)(_A->memory));
+  float* B = ((float*)(_B->memory));
+  float* B__reduce_init = ((float*)(_B->memory));
+  float* rf_B = ((float*)(rf__B->memory));
+  float* rf_B__reduce_init = ((float*)(rf__B->memory));
+  for (int32_t i = 0; i < 32; i += 1) {
+    for (int32_t rf_j0 = 0; rf_j0 < 2; rf_j0 += 1) {
+      rf_B__reduce_init[((2 * i) + rf_j0)] = 0.00000000f;
+      for (int32_t k0 = 0; k0 < 16; k0 += 1) {
+        rf_B[((2 * i) + rf_j0)] = (rf_B[((2 * i) + rf_j0)] + A[((32 * i) + ((16 * rf_j0) + k0))]);
+      };
+    };
+  };
+  for (int32_t i = 0; i < 32; i += 1) {
+    B__reduce_init[i] = 0.00000000f;
+    for (int32_t j0 = 0; j0 < 2; j0 += 1) {
+      B[i] = (B[i] + rf_B[((2 * i) + j0)]);
+    };
+  };
+  cinn_buffer_free((void*)(0), rf__B);
+  cinn_buffer_free((void*)(0), _B);
+}
+)ROC";
+  ASSERT_EQ(utils::Trim(target_code), utils::Trim(source_code));
+}
+
+TEST(IrSchedule, rfactor2) {
+  Context::Global().ResetNameId();
+  Expr M(32);
+  Expr N(2);
+  Expr K(16);
+
+  Target target = common::DefaultHostTarget();
+
+  Placeholder<float> A("A", {M, K});
+  Placeholder<float> B("B", {K, N});
+  Var k(16, "k0");
+  auto C = Compute(
+      {M, N}, [&](Var i, Var j) { return lang::ReduceSum(A(i, k) * B(k, j), {k}); }, "C");
+
+  auto stages = CreateStages({A, B, C});
+  auto func = cinn::lang::LowerVec("test_rfactor", stages, {A, B, C}, {}, {}, nullptr, target, true);
+  CHECK(!func.empty());
+  auto ast_expr = func[0]->body;
+  std::vector<Expr> vec_ast{ast_expr};
+  ir::ModuleExpr mod_expr(vec_ast);
+  ir::IRSchedule ir_sch(mod_expr);
+  auto loops = ir_sch.GetLoops("C");
+  CHECK_EQ(loops.size(), 3U);
+  auto new_rf_tensor = ir_sch.Rfactor(loops[2], 0);
+  auto* new_rf_tensor_ref = new_rf_tensor.As<ir::_Tensor_>();
+  CHECK(new_rf_tensor_ref);
+  CHECK(new_rf_tensor_ref->buffer.defined());
+  func[0]->temp_bufs.push_back(new_rf_tensor_ref->buffer);
+  func[0]->PrepareBufferCastExprs();
+  std::string origin = utils::GetStreamCnt(func[0]);
+
+  EXPECT_EQ(origin, utils::Trim(R"ROC(
+function test_rfactor (_A, _B, _C)
+{
+  ScheduleBlock(root)
+  {
+    {
+      serial for (rf_k0, 0, 16)
+      {
+        serial for (i, 0, 32)
+        {
+          serial for (j, 0, 2)
+          {
+            ScheduleBlock(rf_C__reduce_init)
+            {
+              i0, i1, i2_0 = axis.bind(i, j, rf_k0)
+              rf_C__reduce_init[i2_0, i0, i1] = 0.00000000f
+            }
+            ScheduleBlock(rf_C)
+            {
+              i0_0, i1_0, i2 = axis.bind(i, j, rf_k0)
+              rf_C[i2, i0_0, i1_0] = (rf_C[i2, i0_0, i1_0] + (A[i0_0, i2] * B[i2, i1_0]))
+            }
+          }
+        }
+      }
+      serial for (i, 0, 32)
+      {
+        serial for (j, 0, 2)
+        {
+          ScheduleBlock(C__reduce_init)
+          {
+            i0, i1 = axis.bind(i, j)
+            C__reduce_init[i0, i1] = 0.00000000f
+          }
+          serial for (k0, 0, 16)
+          {
+            ScheduleBlock(C)
+            {
+              i0_0, i1_0, i2 = axis.bind(i, j, k0)
+              C[i0_0, i1_0] = (C[i0_0, i1_0] + rf_C[i2, i0_0, i1_0])
+            }
+          }
+        }
+      }
+    }
+  }
+}
+)ROC"));
+  // optimize pass: add temp buffers
+  Module::Builder builder("module1", target);
+  for (auto& i : func) {
+    builder.AddFunction(i);
+  }
+  auto module = builder.Build();
+  CodeGenC codegen(target);
+  codegen.SetInlineBuiltinCodes(false);
+  auto source_code = codegen.Compile(module, CodeGenC::OutputKind::CImpl);
+
+  std::string target_code = R"ROC(
+#include <cinn_runtime.h>
+#include <stdio.h>
+
+void test_rfactor(void* _args, int32_t num_args)
+{
+  const cinn_buffer_t* _A = cinn_pod_value_to_buffer_p(&(((cinn_pod_value_t*)(_args))[0]));
+  const cinn_buffer_t* _B = cinn_pod_value_to_buffer_p(&(((cinn_pod_value_t*)(_args))[1]));
+  cinn_buffer_t* _C = cinn_pod_value_to_buffer_p(&(((cinn_pod_value_t*)(_args))[2]));
+  cinn_buffer_t* rf__C = cinn_buffer_t::new_((cinn_device_kind_t)(0)/*target*/, cinn_float32_t(), { 16, 32, 2 });
+  cinn_buffer_malloc((void*)(0), _C);
+  cinn_buffer_malloc((void*)(0), rf__C);
+  const float* A = ((const float*)(_A->memory));
+  const float* B = ((const float*)(_B->memory));
+  float* C = ((float*)(_C->memory));
+  float* C__reduce_init = ((float*)(_C->memory));
+  float* rf_C = ((float*)(rf__C->memory));
+  float* rf_C__reduce_init = ((float*)(rf__C->memory));
+  for (int32_t rf_k0 = 0; rf_k0 < 16; rf_k0 += 1) {
+    for (int32_t i = 0; i < 32; i += 1) {
+      for (int32_t j = 0; j < 2; j += 1) {
+        rf_C__reduce_init[((2 * i) + ((64 * rf_k0) + j))] = 0.00000000f;
+        rf_C[((2 * i) + ((64 * rf_k0) + j))] = fma(A[((16 * i) + rf_k0)], B[((2 * rf_k0) + j)], rf_C[((2 * i) + ((64 * rf_k0) + j))]);
+      };
+    };
+  };
+  for (int32_t i = 0; i < 32; i += 1) {
+    for (int32_t j = 0; j < 2; j += 1) {
+      C__reduce_init[((2 * i) + j)] = 0.00000000f;
+      for (int32_t k0 = 0; k0 < 16; k0 += 1) {
+        C[((2 * i) + j)] = (C[((2 * i) + j)] + rf_C[((2 * i) + ((64 * k0) + j))]);
+      };
+    };
+  };
+  cinn_buffer_free((void*)(0), rf__C);
+  cinn_buffer_free((void*)(0), _C);
+}
+)ROC";
+  ASSERT_EQ(utils::Trim(target_code), utils::Trim(source_code));
+}
+
+TEST(IrSchedule, compute_inline1) {
+  Context::Global().ResetNameId();
+  Expr M(32);
+  Expr N(32);
+  Expr P(32);
+
+  Target target = common::DefaultHostTarget();
+
+  Placeholder<float> A("A", {M, N, P});
+  auto B = Compute(
+      {M, N, P}, [&](Var i, Var j, Var k) { return A(i, j, k) + Expr(1.f); }, "B");
+  auto C = Compute(
+      {M, N, P}, [&](Var i, Var j, Var k) { return B(j, i, k) * Expr(2.f); }, "C");
+
+  auto stages = CreateStages({A, B, C});
+
+  auto func = cinn::lang::LowerVec("test_compute_inline1", stages, {A, C}, {}, {}, nullptr, target, true);
+
+  auto ast_expr = func[0]->body;
+  std::vector<Expr> vec_ast{ast_expr};
+  ir::ModuleExpr mod_expr(vec_ast);
+  ir::IRSchedule ir_sch(mod_expr);
+
+  auto block_b = ir_sch.GetBlock("B");
+  ir_sch.ComputeInline(block_b);
+  VLOG(1) << "After ComputeInline, IR is : " << ir_sch.GetModule().GetExprs().at(0);
+  Module::Builder builder("module1", target);
+  for (auto& i : func) {
+    builder.AddFunction(i);
+  }
+  auto module = builder.Build();
+  CodeGenC codegen(target);
+  codegen.SetInlineBuiltinCodes(false);
+  auto source_code = codegen.Compile(module, CodeGenC::OutputKind::CImpl);
+
+  VLOG(1) << "compute_inline1 source code is :\n" << source_code;
+
+  std::string target_code = R"ROC(
+#include <cinn_runtime.h>
+#include <stdio.h>
+
+void test_compute_inline1(void* _args, int32_t num_args)
+{
+  const cinn_buffer_t* _A = cinn_pod_value_to_buffer_p(&(((cinn_pod_value_t*)(_args))[0]));
+  cinn_buffer_t* _C = cinn_pod_value_to_buffer_p(&(((cinn_pod_value_t*)(_args))[1]));
+  cinn_buffer_t* _B = cinn_buffer_t::new_((cinn_device_kind_t)(0)/*target*/, cinn_float32_t(), { 32, 32, 32 });
+  cinn_buffer_malloc((void*)(0), _C);
+  cinn_buffer_malloc((void*)(0), _B);
+  const float* A = ((const float*)(_A->memory));
+  float* B = ((float*)(_B->memory));
+  float* C = ((float*)(_C->memory));
+  for (int32_t i = 0; i < 32; i += 1) {
+    for (int32_t j = 0; j < 32; j += 1) {
+      for (int32_t k = 0; k < 32; k += 1) {
+        C[((1024 * i) + ((32 * j) + k))] = fma(2.00000000f, A[((32 * i) + ((1024 * j) + k))], 2.00000000f);
+      };
+    };
+  };
+  cinn_buffer_free((void*)(0), _B);
+  cinn_buffer_free((void*)(0), _C);
+}
+)ROC";
+  ASSERT_EQ(utils::Trim(target_code), utils::Trim(source_code));
+}
+
+TEST(IrSchedule, compute_inline2) {
+  Context::Global().ResetNameId();
+  Expr M(32);
+  Expr N(32);
+  Expr P(32);
+
+  Target target = common::DefaultHostTarget();
+
+  Placeholder<float> A("A", {M, N, P});
+  auto B = Compute(
+      {M, N, P}, [&](Var i, Var j, Var k) { return A(i, j, k) + Expr(1.f); }, "B");
+  auto C = Compute(
+      {M, N, P}, [&](Var i, Var j, Var k) { return B(i, j, k) * Expr(2.f); }, "C");
+
+  auto stages = CreateStages({A, B, C});
+
+  auto func = cinn::lang::LowerVec("test_compute_inline2", stages, {A, C}, {}, {}, nullptr, target, true);
+
+  auto ast_expr = func[0]->body;
+  std::vector<Expr> vec_ast{ast_expr};
+  ir::ModuleExpr mod_expr(vec_ast);
+  ir::IRSchedule ir_sch(mod_expr);
+
+  auto block_b = ir_sch.GetBlock("B");
+  auto loops = ir_sch.GetLoops("C");
+  ir_sch.ComputeAt(block_b, loops[1]);
+  block_b = ir_sch.GetBlock("B");
+  ir_sch.ComputeInline(block_b);
+  VLOG(1) << "After ComputeInline, IR is : " << ir_sch.GetModule().GetExprs().at(0);
+  Module::Builder builder("module1", target);
+  for (auto& i : func) {
+    builder.AddFunction(i);
+  }
+  auto module = builder.Build();
+  CodeGenC codegen(target);
+  codegen.SetInlineBuiltinCodes(false);
+  auto source_code = codegen.Compile(module, CodeGenC::OutputKind::CImpl);
+
+  VLOG(1) << "compute_inline2 source code is :\n" << source_code;
+
+  std::string target_code = R"ROC(
+#include <cinn_runtime.h>
+#include <stdio.h>
+
+void test_compute_inline2(void* _args, int32_t num_args)
+{
+  const cinn_buffer_t* _A = cinn_pod_value_to_buffer_p(&(((cinn_pod_value_t*)(_args))[0]));
+  cinn_buffer_t* _C = cinn_pod_value_to_buffer_p(&(((cinn_pod_value_t*)(_args))[1]));
+  cinn_buffer_t* _B = cinn_buffer_t::new_((cinn_device_kind_t)(0)/*target*/, cinn_float32_t(), { 32, 32, 32 });
+  cinn_buffer_malloc((void*)(0), _C);
+  cinn_buffer_malloc((void*)(0), _B);
+  const float* A = ((const float*)(_A->memory));
+  float* B = ((float*)(_B->memory));
+  float* C = ((float*)(_C->memory));
+  for (int32_t i = 0; i < 32; i += 1) {
+    for (int32_t j = 0; j < 32; j += 1) {
+      for (int32_t k = 0; k < 32; k += 1) {
+        C[((1024 * i) + ((32 * j) + k))] = fma(2.00000000f, A[((1024 * i) + ((32 * j) + k))], 2.00000000f);
+      };
+    };
+  };
+  cinn_buffer_free((void*)(0), _B);
+  cinn_buffer_free((void*)(0), _C);
+}
+)ROC";
+  ASSERT_EQ(utils::Trim(target_code), utils::Trim(source_code));
+}
+
+#ifdef CINN_WITH_CUDA
+TEST(IrSchedule, compute_inline3) {
+  Context::Global().ResetNameId();
+  Expr M(32);
+  Expr N(32);
+  Expr P(32);
+
+  Target target = common::DefaultNVGPUTarget();
+
+  Placeholder<float> A("A", {M, N, P});
+  auto B = Compute(
+      {M, N, P}, [&](Var i, Var j, Var k) { return A(i, j, k) + Expr(1.f); }, "B");
+  auto C = Compute(
+      {M, N, P}, [&](Var i, Var j, Var k) { return B(j, i, k) * Expr(2.f); }, "C");
+
+  auto stages = CreateStages({A, B, C});
+  stages[B]->SetBuffer("local");
+
+  auto func = cinn::lang::LowerVec("test_compute_inline3", stages, {A, C}, {}, {}, nullptr, target, true);
+
+  auto ast_expr = func[0]->body;
+  std::vector<Expr> vec_ast{ast_expr};
+  ir::ModuleExpr mod_expr(vec_ast);
+  ir::IRSchedule ir_sch(mod_expr);
+
+  auto block_b = ir_sch.GetBlock("B");
+  ir_sch.ComputeInline(block_b);
+  VLOG(1) << "After ComputeInline, IR is : " << ir_sch.GetModule().GetExprs().at(0);
+
+  Module::Builder builder("module1", target);
+  for (auto& i : func) {
+    builder.AddFunction(i);
+  }
+  auto module = builder.Build();
+  CodeGenCUDA_Dev codegen(target);
+  codegen.SetInlineBuiltinCodes(false);
+  auto source_code = codegen.Compile(module, CodeGenC::OutputKind::CImpl);
+
+  VLOG(1) << "compute_inline3 source code is :\n" << source_code;
+
+  std::string target_code = codegen.GetSourceHeader() + R"ROC(__global__
+void test_compute_inline3(const float* __restrict__ A, float* __restrict__ C)
+{
+  float _B_temp_buffer [ 32768 ];
+  float* B = _B_temp_buffer;
+  for (int32_t i = 0; i < 32; i += 1) {
+    for (int32_t j = 0; j < 32; j += 1) {
+      for (int32_t k = 0; k < 32; k += 1) {
+        C[((1024 * i) + ((32 * j) + k))] = (2.00000000f + (2.00000000f * A[((32 * i) + ((1024 * j) + k))]));
+      };
+    };
+  };
+}
+)ROC";
+  ASSERT_EQ(utils::Trim(target_code), utils::Trim(source_code));
+}
+
+TEST(IrSchedule, compute_inline4) {
+  Context::Global().ResetNameId();
+  Expr M(32);
+  Expr N(32);
+  Expr P(32);
+
+  Target target = common::DefaultNVGPUTarget();
+
+  Placeholder<float> A("A", {M, N, P});
+  auto B = Compute(
+      {M, N, P}, [&](Var i, Var j, Var k) { return A(i, j, k) + Expr(1.f); }, "B");
+  auto C = Compute(
+      {M, N, P}, [&](Var i, Var j, Var k) { return B(i, j, k) * Expr(2.f); }, "C");
+
+  auto stages = CreateStages({A, B, C});
+  stages[B]->SetBuffer("local");
+
+  auto func = cinn::lang::LowerVec("test_compute_inline4", stages, {A, C}, {}, {}, nullptr, target, true);
+
+  auto ast_expr = func[0]->body;
+  std::vector<Expr> vec_ast{ast_expr};
+  ir::ModuleExpr mod_expr(vec_ast);
+  ir::IRSchedule ir_sch(mod_expr);
+
+  auto block_b = ir_sch.GetBlock("B");
+  auto loops = ir_sch.GetLoops("C");
+  ir_sch.ComputeAt(block_b, loops[1]);
+  block_b = ir_sch.GetBlock("B");
+  ir_sch.ComputeInline(block_b);
+  VLOG(1) << "After ComputeInline, IR is : " << ir_sch.GetModule().GetExprs().at(0);
+  Module::Builder builder("module1", target);
+  for (auto& i : func) {
+    builder.AddFunction(i);
+  }
+  auto module = builder.Build();
+  CodeGenCUDA_Dev codegen(target);
+  codegen.SetInlineBuiltinCodes(false);
+  auto source_code = codegen.Compile(module, CodeGenC::OutputKind::CImpl);
+
+  std::string target_code = codegen.GetSourceHeader() + R"ROC(__global__
+void test_compute_inline4(const float* __restrict__ A, float* __restrict__ C)
+{
+  float _B_temp_buffer [ 32768 ];
+  float* B = _B_temp_buffer;
+  for (int32_t i = 0; i < 32; i += 1) {
+    for (int32_t j = 0; j < 32; j += 1) {
+      for (int32_t k = 0; k < 32; k += 1) {
+        C[((1024 * i) + ((32 * j) + k))] = (2.00000000f + (2.00000000f * A[((1024 * i) + ((32 * j) + k))]));
+      };
+    };
+  };
+}
+)ROC";
+  ASSERT_EQ(utils::Trim(target_code), utils::Trim(source_code));
+}
+#endif
+
+TEST(IrSchedule, reverse_compute_inline1) {
+  Context::Global().ResetNameId();
+  Expr M(32);
+  Expr N(64);
+
+  Target target = common::DefaultHostTarget();
+
+  Placeholder<float> A("A", {M, N});
+  auto B = Compute(
+      {M, N}, [&](Var i, Var j) { return Expr(1.f) + A(i, j); }, "B");
+  auto C = Compute(
+      {N, M}, [&](Var i, Var j) { return Expr(2.f) * B(j, i); }, "C");
+
+  auto stages = CreateStages({A, B, C});
+
+  auto func = cinn::lang::LowerVec("test_compute_inline1", stages, {A, C}, {}, {}, nullptr, target, true);
+
+  auto ast_expr = func[0]->body;
+  std::vector<Expr> vec_ast{ast_expr};
+  ir::ModuleExpr mod_expr(vec_ast);
+  ir::IRSchedule ir_sch(mod_expr);
+
+  auto block_c = ir_sch.GetBlock("C");
+  ir_sch.ReverseComputeInline(block_c);
+  Module::Builder builder("module1", target);
+  for (auto& i : func) {
+    builder.AddFunction(i);
+  }
+  auto module = builder.Build();
+  CodeGenC codegen(target);
+  codegen.SetInlineBuiltinCodes(false);
+  auto source_code = codegen.Compile(module, CodeGenC::OutputKind::CImpl);
+
+  VLOG(1) << "compute_inline1 source code is :\n" << source_code;
+
+  std::string target_code = R"ROC(
+#include <cinn_runtime.h>
+#include <stdio.h>
+
+void test_compute_inline1(void* _args, int32_t num_args)
+{
+  const cinn_buffer_t* _A = cinn_pod_value_to_buffer_p(&(((cinn_pod_value_t*)(_args))[0]));
+  cinn_buffer_t* _C = cinn_pod_value_to_buffer_p(&(((cinn_pod_value_t*)(_args))[1]));
+  cinn_buffer_t* _B = cinn_buffer_t::new_((cinn_device_kind_t)(0)/*target*/, cinn_float32_t(), { 32, 64 });
+  cinn_buffer_malloc((void*)(0), _C);
+  cinn_buffer_malloc((void*)(0), _B);
+  const float* A = ((const float*)(_A->memory));
+  float* B = ((float*)(_B->memory));
+  float* C = ((float*)(_C->memory));
+  for (int32_t i = 0; i < 32; i += 1) {
+    for (int32_t j = 0; j < 64; j += 1) {
+      C[((32 * j) + i)] = fma(2.00000000f, A[((64 * i) + j)], 2.00000000f);
+    };
+  };
+  cinn_buffer_free((void*)(0), _B);
+  cinn_buffer_free((void*)(0), _C);
+}
+)ROC";
+  ASSERT_EQ(utils::Trim(target_code), utils::Trim(source_code));
+}
+
+TEST(IrSchedule, reverse_compute_inline2) {
+  Context::Global().ResetNameId();
+  Expr M(32);
+  Expr N(32);
+  Expr P(32);
+
+  Target target = common::DefaultHostTarget();
+
+  Placeholder<float> A("A", {M, N, P});
+  auto B = Compute(
+      {M, N, P}, [&](Var i, Var j, Var k) { return Expr(1.f) + A(i, j, k); }, "B");
+  auto C = Compute(
+      {N, M, P}, [&](Var i, Var j, Var k) { return Expr(2.f) * B(j, i, k); }, "C");
+
+  auto stages = CreateStages({A, B, C});
+
+  auto func = cinn::lang::LowerVec("test_compute_inline1", stages, {A, C}, {}, {}, nullptr, target, true);
+
+  auto ast_expr = func[0]->body;
+  std::vector<Expr> vec_ast{ast_expr};
+  ir::ModuleExpr mod_expr(vec_ast);
+  ir::IRSchedule ir_sch(mod_expr);
+
+  auto block_c = ir_sch.GetBlock("C");
+  ir_sch.ReverseComputeInline(block_c);
+  Module::Builder builder("module1", target);
+  for (auto& i : func) {
+    builder.AddFunction(i);
+  }
+  auto module = builder.Build();
+  CodeGenC codegen(target);
+  codegen.SetInlineBuiltinCodes(false);
+  auto source_code = codegen.Compile(module, CodeGenC::OutputKind::CImpl);
+
+  VLOG(1) << "compute_inline1 source code is :\n" << source_code;
+
+  std::string target_code = R"ROC(
+#include <cinn_runtime.h>
+#include <stdio.h>
+
+void test_compute_inline1(void* _args, int32_t num_args)
+{
+  const cinn_buffer_t* _A = cinn_pod_value_to_buffer_p(&(((cinn_pod_value_t*)(_args))[0]));
+  cinn_buffer_t* _C = cinn_pod_value_to_buffer_p(&(((cinn_pod_value_t*)(_args))[1]));
+  cinn_buffer_t* _B = cinn_buffer_t::new_((cinn_device_kind_t)(0)/*target*/, cinn_float32_t(), { 32, 32, 32 });
+  cinn_buffer_malloc((void*)(0), _C);
+  cinn_buffer_malloc((void*)(0), _B);
+  const float* A = ((const float*)(_A->memory));
+  float* B = ((float*)(_B->memory));
+  float* C = ((float*)(_C->memory));
+  for (int32_t i = 0; i < 32; i += 1) {
+    for (int32_t j = 0; j < 32; j += 1) {
+      for (int32_t k = 0; k < 32; k += 1) {
+        C[((32 * i) + ((1024 * j) + k))] = fma(2.00000000f, A[((1024 * i) + ((32 * j) + k))], 2.00000000f);
+      };
+    };
+  };
+  cinn_buffer_free((void*)(0), _B);
+  cinn_buffer_free((void*)(0), _C);
+}
+)ROC";
+  ASSERT_EQ(utils::Trim(target_code), utils::Trim(source_code));
+}
+
+TEST(IrSchedule, copytransform1) {
+  Context::Global().ResetNameId();
+  Expr M(32);
+  Expr N(32);
+  Expr P(32);
+
+  Target target = common::DefaultHostTarget();
+
+  Placeholder<float> A("A", {M, N, P});
+  auto B = Compute(
+      {M, N, P}, [&](Var i, Var j, Var k) { return A(i, j, k) + Expr(1.f); }, "B");
+  auto C = Compute(
+      {M, N, P}, [&](Var i, Var j, Var k) { return B(j, i, k) * Expr(2.f); }, "C");
+
+  auto stages = CreateStages({A, B, C});
+
+  auto func = cinn::lang::LowerVec("test_copytransform1", stages, {A, C}, {}, {}, nullptr, target, true);
+
+  auto ast_expr = func[0]->body;
+  std::vector<Expr> vec_ast{ast_expr};
+  ir::ModuleExpr mod_expr(vec_ast);
+  ir::IRSchedule ir_sch(mod_expr);
+
+  auto block_c = ir_sch.GetBlock("C");
+  auto loops_c = ir_sch.GetLoops(block_c);
+  auto splited = ir_sch.Split(loops_c[1], {-1, 4});
+  block_c = ir_sch.GetBlock("C");
+  loops_c = ir_sch.GetLoops(block_c);
+  splited = ir_sch.Split(loops_c[0], {-1, 8});
+
+  auto block_b = ir_sch.GetBlock("B");
+  block_c = ir_sch.GetBlock("C");
+
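+  // NOTE: CopyTransformAndLoopInfo below is expected to copy C's loop
+  // transformations -- the two Splits applied above -- onto block B; see how
+  // B's loop nest in target_code is tiled 4 x 8 x 8 x 4 x 32 just like C's.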
+  ir_sch.CopyTransformAndLoopInfo(block_b, block_c);
+  Module::Builder builder("module1", target);
+  for (auto& i : func) {
+    builder.AddFunction(i);
+  }
+  auto module = builder.Build();
+  CodeGenC codegen(target);
+  codegen.SetInlineBuiltinCodes(false);
+  auto source_code = codegen.Compile(module, CodeGenC::OutputKind::CImpl);
+
+  std::string target_code = R"ROC(
+#include <cinn_runtime.h>
+#include <stdio.h>
+
+void test_copytransform1(void* _args, int32_t num_args)
+{
+  const cinn_buffer_t* _A = cinn_pod_value_to_buffer_p(&(((cinn_pod_value_t*)(_args))[0]));
+  cinn_buffer_t* _C = cinn_pod_value_to_buffer_p(&(((cinn_pod_value_t*)(_args))[1]));
+  cinn_buffer_t* _B = cinn_buffer_t::new_((cinn_device_kind_t)(0)/*target*/, cinn_float32_t(), { 32, 32, 32 });
+  cinn_buffer_malloc((void*)(0), _C);
+  cinn_buffer_malloc((void*)(0), _B);
+  const float* A = ((const float*)(_A->memory));
+  float* B = ((float*)(_B->memory));
+  float* C = ((float*)(_C->memory));
+  for (int32_t i = 0; i < 4; i += 1) {
+    for (int32_t i_0 = 0; i_0 < 8; i_0 += 1) {
+      for (int32_t j = 0; j < 8; j += 1) {
+        for (int32_t j_0 = 0; j_0 < 4; j_0 += 1) {
+          for (int32_t k = 0; k < 32; k += 1) {
+            B[((8192 * i) + ((1024 * i_0) + ((128 * j) + ((32 * j_0) + k))))] = (1.00000000f + A[((8192 * i) + ((1024 * i_0) + ((128 * j) + ((32 * j_0) + k))))]);
+          };
+        };
+      };
+    };
+  };
+  for (int32_t i = 0; i < 4; i += 1) {
+    for (int32_t i_0 = 0; i_0 < 8; i_0 += 1) {
+      for (int32_t j = 0; j < 8; j += 1) {
+        for (int32_t j_0 = 0; j_0 < 4; j_0 += 1) {
+          for (int32_t k = 0; k < 32; k += 1) {
+            C[((8192 * i) + ((1024 * i_0) + ((128 * j) + ((32 * j_0) + k))))] = (2.00000000f * B[((256 * i) + ((32 * i_0) + ((4096 * j) + ((1024 * j_0) + k))))]);
+          };
+        };
+      };
+    };
+  };
+  cinn_buffer_free((void*)(0), _B);
+  cinn_buffer_free((void*)(0), _C);
+}
+)ROC";
+  ASSERT_EQ(utils::Trim(target_code), utils::Trim(source_code));
+}
+
+TEST(IrSchedule, copytransform2) {
+  Context::Global().ResetNameId();
+  Expr M(32);
+  Expr N(64);
+  Expr P(128);
+
+  Target target = common::DefaultHostTarget();
+
+  Placeholder<float> A("A", {M, N, P});
+  auto B = Compute(
+      {M, N, P}, [&](Var i, Var j, Var k) { return A(i, j, k) + Expr(1.f); }, "B");
+  auto C = Compute(
+      {M, M, P}, [&](Var i, Var j, Var k) { return B(i, j, k) * Expr(2.f); }, "C");
+
+  auto stages = CreateStages({A, B, C});
+
+  auto func = cinn::lang::LowerVec("test_copytransform2", stages, {A, C}, {}, {}, nullptr, target, true);
+
+  auto ast_expr = func[0]->body;
+  std::vector<Expr> vec_ast{ast_expr};
+  ir::ModuleExpr mod_expr(vec_ast);
+  ir::IRSchedule ir_sch(mod_expr);
+
+  auto block_c = ir_sch.GetBlock("C");
+  auto loops_c = ir_sch.GetLoops(block_c);
+  auto splited = ir_sch.Split(loops_c[1], {-1, 4});
+  block_c = ir_sch.GetBlock("C");
+  loops_c = ir_sch.GetLoops(block_c);
+  splited = ir_sch.Split(loops_c[0], {-1, 8});
+
+  auto block_b = ir_sch.GetBlock("B");
+  block_c = ir_sch.GetBlock("C");
+  ir_sch.CopyTransformAndLoopInfo(block_b, block_c);
+  Module::Builder builder("module1", target);
+  for (auto& i : func) {
+    builder.AddFunction(i);
+  }
+  auto module = builder.Build();
+  CodeGenC codegen(target);
+  codegen.SetInlineBuiltinCodes(false);
+  auto source_code = codegen.Compile(module, CodeGenC::OutputKind::CImpl);
+
+  std::string target_code = R"ROC(
+#include <cinn_runtime.h>
+#include <stdio.h>
+
+void test_copytransform2(void* _args, int32_t num_args)
+{
+  const cinn_buffer_t* _A = cinn_pod_value_to_buffer_p(&(((cinn_pod_value_t*)(_args))[0]));
+  cinn_buffer_t* _C = cinn_pod_value_to_buffer_p(&(((cinn_pod_value_t*)(_args))[1]));
+  cinn_buffer_t* _B = cinn_buffer_t::new_((cinn_device_kind_t)(0)/*target*/, cinn_float32_t(), { 32, 64, 128 });
+  cinn_buffer_malloc((void*)(0), _C);
+  cinn_buffer_malloc((void*)(0), _B);
+  const float* A = ((const float*)(_A->memory));
+  float* B = ((float*)(_B->memory));
+  float* C = ((float*)(_C->memory));
+  for (int32_t i = 0; i < 4; i += 1) {
+    for (int32_t i_0 = 0; i_0 < 8; i_0 += 1) {
+      for (int32_t j = 0; j < 64; j += 1) {
+        for (int32_t k = 0; k < 128; k += 1) {
+          B[((65536 * i) + ((8192 * i_0) + ((128 * j) + k)))] = (1.00000000f + A[((65536 * i) + ((8192 * i_0) + ((128 * j) + k)))]);
+        };
+      };
+    };
+  };
+  for (int32_t i = 0; i < 4; i += 1) {
+    for (int32_t i_0 = 0; i_0 < 8; i_0 += 1) {
+      for (int32_t j = 0; j < 8; j += 1) {
+        for (int32_t j_0 = 0; j_0 < 4; j_0 += 1) {
+          for (int32_t k = 0; k < 128; k += 1) {
+            C[((32768 * i) + ((4096 * i_0) + ((512 * j) + ((128 * j_0) + k))))] = (2.00000000f * B[((65536 * i) + ((8192 * i_0) + ((512 * j) + ((128 * j_0) + k))))]);
+          };
+        };
+      };
+    };
+  };
+  cinn_buffer_free((void*)(0), _B);
+  cinn_buffer_free((void*)(0), _C);
+}
+)ROC";
+  ASSERT_EQ(utils::Trim(target_code), utils::Trim(source_code));
+}
+
+TEST(IrSchedule, Annotate) {
+  Context::Global().ResetNameId();
+  Expr M(32);
+  Expr N(32);
+  Placeholder<float> A("A", {M, N});
+  auto B = Compute(
+      {M, N}, [&](Var i, Var j) { return A(i, j); }, "B");
+
+  auto funcs = cinn::lang::LowerVec(
+      "test_annotate", CreateStages({A, B}), {A, B}, {}, {}, nullptr, common::DefaultHostTarget(), true);
+  ir::IRSchedule ir_sch(ir::ModuleExpr({funcs[0]->body}));
+  auto fused = ir_sch.Fuse("B", {0, 1});
+  auto block_b = ir_sch.GetBlock("B");
+  ir_sch.Annotate(block_b, "k1", int(64));
+  block_b = ir_sch.GetBlock("B");
+  ir_sch.Annotate(block_b, "k2", bool(true));
+  block_b = ir_sch.GetBlock("B");
+  ir_sch.Annotate(block_b, "k3", float(2.0));
+  block_b = ir_sch.GetBlock("B");
+  ir_sch.Annotate(block_b, "k4", std::string("v4"));
+  std::string expected_expr = R"ROC({
+  ScheduleBlock(root)
+  {
+    serial for (i_j_fused, 0, 1024)
+    {
+      ScheduleBlock(B)
+      {
+        i0, i1 = axis.bind((i_j_fused / 32), (i_j_fused % 32))
+        attrs(k1:64, k2:1, k3:2, k4:v4)
+        B[i0, i1] = A[i0, i1]
+      }
+    }
+  }
+})ROC";
+  ASSERT_EQ(utils::GetStreamCnt(ir_sch.GetModule().GetExprs().front()), expected_expr);
+}
+
+TEST(IrSchedule, Unannotate) {
+  Context::Global().ResetNameId();
+  Expr M(32);
+  Expr N(32);
+  Placeholder<float> A("A", {M, N});
+  auto B = Compute(
+      {M, N}, [&](Var i, Var j) { return A(i, j); }, "B");
+
+  auto funcs = cinn::lang::LowerVec(
+      "test_unannotate", CreateStages({A, B}), {A, B}, {}, {}, nullptr, common::DefaultHostTarget(), true);
+  ir::IRSchedule ir_sch(ir::ModuleExpr({funcs[0]->body}));
+  auto fused = ir_sch.Fuse("B", {0, 1});
+  auto block_b = ir_sch.GetBlock("B");
+  ir_sch.Annotate(block_b, "k1", int(64));
+  block_b = ir_sch.GetBlock("B");
+  ir_sch.Annotate(block_b, "k2", bool(true));
+  block_b = ir_sch.GetBlock("B");
+  ir_sch.Annotate(block_b, "k3", float(2.0));
+  block_b = ir_sch.GetBlock("B");
+  ir_sch.Annotate(block_b, "k4", std::string("v4"));
+  block_b = ir_sch.GetBlock("B");
+  ir_sch.Unannotate(block_b, "k1");
+  block_b = ir_sch.GetBlock("B");
+  ir_sch.Unannotate(block_b, "k2");
+  block_b = ir_sch.GetBlock("B");
+  ir_sch.Unannotate(block_b, "k3");
+  block_b = ir_sch.GetBlock("B");
+  ir_sch.Unannotate(block_b, "k4");
+  std::string expected_expr = R"ROC({
+  ScheduleBlock(root)
+  {
+    serial for (i_j_fused, 0, 1024)
+    {
+      ScheduleBlock(B)
+      {
+        i0, i1 = axis.bind((i_j_fused / 32), (i_j_fused % 32))
+        B[i0, i1] = A[i0, i1]
+      }
+    }
+  }
+})ROC";
+  ASSERT_EQ(utils::GetStreamCnt(ir_sch.GetModule().GetExprs().front()), expected_expr);
+}
+
+TEST(IrSchedule, ComplexIndices) {
+  Target target = common::DefaultHostTarget();
+  ir::Expr M(32);
+  ir::Expr K(64);
+
+  Placeholder<float> A("A", {M, K});
+  Var k(K.as_int32(), "reduce_axis_k");
+  ir::Tensor B = Compute(
+      {M}, [&](Var i) { return ReduceSum(A(i, k), {k}); }, "B");
+
+  poly::StageMap stages = CreateStages({B});
+  std::vector<ir::LoweredFunc> funcs =
+      lang::LowerVec("TestIrSchedule_ReduceSum", stages, {A, B}, {}, {}, nullptr, target, true);
+  ir::IRSchedule ir_sch(ir::ModuleExpr({funcs[0]->body}));
+  VLOG(3) << "Lowered Expr:" << ir_sch.GetModule().GetExprs().front();
+
+  auto loops_b = ir_sch.GetLoops("B");
+  CHECK_EQ(loops_b.size(), 2);
+  ir_sch.Split("B", 0, {8, -1});
+  ir_sch.Split("B", 2, {32, -1});  // after the first Split, the loop nest has grown to 3 loops
+  VLOG(3) << "Splited Expr:" << ir_sch.GetModule().GetExprs().front();
+
+  CHECK_EQ(ir_sch.GetLoops("B").size(), 4);
+  ir_sch.Reorder("B", {2, 0, 3, 1});
+  VLOG(3) << "Reordered Expr:\n" << ir_sch.GetModule().GetExprs().front();
+
+  auto block_b = ir_sch.GetBlock("B");
+  auto a_cache = ir_sch.CacheRead(block_b, 1, "shared");  // actually the read_buffer A should be indexed by 0
+  VLOG(3) << "CacheRead-A Expr:\n" << ir_sch.GetModule().GetExprs().front();
+
+  loops_b = ir_sch.GetLoops("B");
+  ir_sch.ComputeAt(a_cache, loops_b[0]);
+  VLOG(3) << "A_cache-ComputeAt-B Expr:\n" << ir_sch.GetModule().GetExprs().front();
+
+  block_b = ir_sch.GetBlock("B");
+  auto b_cache = ir_sch.CacheWrite(block_b, 0, "local");
+  VLOG(3) << "CacheWrite-B Expr:\n" << ir_sch.GetModule().GetExprs().front();
+
+  auto loops_b_cache =
+      ir_sch.GetLoops(b_cache.As<ir::ScheduleBlockRealize>()->schedule_block.As<ir::ScheduleBlock>()->name);
+  block_b = ir_sch.GetBlock("B");
+  ir_sch.ReverseComputeAt(block_b, loops_b_cache[1]);
+  VLOG(3) << "B-ReverseComputeAt-B_cache Expr:\n" << ir_sch.GetModule().GetExprs().front();
+
+  Module::Builder builder("module1", target);
+  for (auto& i : funcs) {
+    builder.AddFunction(i);
+  }
+  auto module = builder.Build();
+  CodeGenC codegen(target);
+  codegen.SetInlineBuiltinCodes(false);
+  auto source_code = codegen.Compile(module, CodeGenC::OutputKind::CImpl);
+  VLOG(3) << "scheduled source code:\n" << source_code;
+
+  std::string target_code = R"ROC(
+#include <cinn_runtime.h>
+#include <stdio.h>
+
+void TestIrSchedule_ReduceSum(void* _args, int32_t num_args)
+{
+  const cinn_buffer_t* _A = cinn_pod_value_to_buffer_p(&(((cinn_pod_value_t*)(_args))[0]));
+  cinn_buffer_t* _B = cinn_pod_value_to_buffer_p(&(((cinn_pod_value_t*)(_args))[1]));
+  cinn_buffer_malloc((void*)(0), _B);
+  const float* A = ((const float*)(_A->memory));
+  float* B = ((float*)(_B->memory));
+  float* B__reduce_init = ((float*)(_B->memory));
+  for (int32_t i = 0; i < 8; i += 1) {
+    for (int32_t i_0 = 0; i_0 < 4; i_0 += 1) {
+      B__reduce_init[((4 * i) + i_0)] = 0.00000000f;
+    };
+  };
+  for (int32_t reduce_axis_k = 0; reduce_axis_k < 32; reduce_axis_k += 1) {
+    for (int32_t ax0 = 0; ax0 < 32; ax0 += 1) {
+      for (int32_t ax1 = 0; ax1 < 2; ax1 += 1) {
+        A_shared_temp_buffer[((64 * ax0) + ((2 * reduce_axis_k) + ax1))] = A[((64 * ax0) + ((2 * reduce_axis_k) + ax1))];
+      };
+    };
+    for (int32_t i = 0; i < 8; i += 1) {
+      for (int32_t reduce_axis_k_0 = 0; reduce_axis_k_0 < 2; reduce_axis_k_0 += 1) {
+        for (int32_t i_0 = 0; i_0 < 4; i_0 += 1) {
+          B_local_temp_buffer[((4 * i) + i_0)] = (B_local_temp_buffer[((4 * i) + i_0)] + A_shared_temp_buffer[((256 * i) + ((64 * i_0) + ((2 * reduce_axis_k) + reduce_axis_k_0)))]);
+        };
+      };
+      for (int32_t ax0_0 = 0; ax0_0 < 4; ax0_0 += 1) {
+        B[((4 * i) + ax0_0)] = B_local_temp_buffer[((4 * i) + ax0_0)];
+      };
+    };
+  };
+  cinn_buffer_free((void*)(0), _B);
+}
+)ROC";
+  ASSERT_EQ(utils::Trim(target_code), utils::Trim(source_code));
+}
+
+TEST(IrSchedule, SamplePerfectTile) {
+  Context::Global().ResetNameId();
+  Expr M(1024);
+  Placeholder<float> A("A", {M});
+  auto B = Compute(
+      {M}, [&](Expr i) { return A(i) + 1; }, "B");
+  poly::StageMap stages = CreateStages({A, B});
+
+  auto funcs = cinn::lang::LowerVec(
+      "test_sampleperfecttile", stages, {A, B}, {}, {}, nullptr, common::DefaultHostTarget(), true);
+
+  ir::IRSchedule ir_sch(ir::ModuleExpr({funcs[0]->body}));
+  auto loops_b = ir_sch.GetLoops("B");
+  std::vector<Expr> result = ir_sch.SamplePerfectTile(loops_b[0], 3, 64);
+  ASSERT_EQ(result.size(), 3);
+}
+
+TEST(IrSchedule, GetChildBlocks) {
+  Context::Global().ResetNameId();
+  Expr M(32);
+  Expr N(32);
+  Expr K(32);
+  Placeholder<float> A("A", {M, N, K});
+  auto B = Compute(
+      {M, N, K}, [&A](Var i, Var j, Var k) { return A(i, j, k); }, "B");
+  auto C = Compute(
+      {M, N, K}, [&B](Var i, Var j, Var k) { return B(i, j, k); }, "C");
+  auto funcs = cinn::lang::LowerVec(
+      "test_getchildblocks", CreateStages({A, B, C}), {A, C}, {}, {}, nullptr, common::DefaultHostTarget(), true);
+  ir::IRSchedule ir_sch(ir::ModuleExpr({funcs[0]->body}));
+
+  auto block_b = ir_sch.GetBlock("B");
+  auto loops = ir_sch.GetLoops("C");
+  ir_sch.ComputeAt(block_b, loops[1]);
+  loops = ir_sch.GetLoops("B");
+  auto root_block = ir_sch.GetRootBlock(loops[1]);
+
+  std::string expected_expr = R"ROC(ScheduleBlock(B)
+{
+  i0, i1, i2 = axis.bind(i, j, (0 + ax0))
+  attrs(compute_at_extra_var:ax0)
+  B[i0, i1, i2] = A[i0, i1, i2]
+}, ScheduleBlock(C)
+{
+  i0_0, i1_0, i2_0 = axis.bind(i, j, k)
+  C[i0_0, i1_0, i2_0] = B[i0_0, i1_0, i2_0]
+})ROC";
+  ASSERT_EQ(utils::GetStreamCnt(ir_sch.GetChildBlocks(root_block)), expected_expr);
+}
+
+TEST(IrSchedule, SampleCategorical) {
+  Context::Global().ResetNameId();
+  Expr M(32);
+  Expr N(32);
+  Expr P(32);
+  Placeholder<float> A("A", {M, N, P});
+  auto B = Compute(
+      {M, N, P}, [&](Var i, Var j, Var k) { return A(i, j, k); }, "B");
+  poly::StageMap stages = CreateStages({A, B});
+  std::vector<int> decision;
+  auto funcs = cinn::lang::LowerVec(
+      "test_samplecategorical", stages, {A, B}, {}, {}, nullptr, common::DefaultHostTarget(), true);
+
+  ir::IRSchedule ir_sch(ir::ModuleExpr({funcs[0]->body}));
+  Expr result = ir_sch.SampleCategorical({1, 2, 3}, {1.0, 2.0, 3.0}, {decision});
+  ASSERT_EQ(result.type(), Int(32));
+}
+
+}  // namespace backends
+}  // namespace cinn
diff --git a/paddle/cinn/backends/llvm/CMakeLists.txt b/paddle/cinn/backends/llvm/CMakeLists.txt
new file mode 100755
index 0000000000000..f405b6b8801b6
--- /dev/null
+++ b/paddle/cinn/backends/llvm/CMakeLists.txt
@@ -0,0 +1,41 @@
+add_definitions(${LLVM_DEFINITIONS})
+
+# generate cinn_runtime.ll file
+
+add_custom_command(
+  OUTPUT ${CMAKE_BINARY_DIR}/cinn/backends/llvm/cinn_runtime_llvm_ir.h
+  COMMAND ${LLVM_PATH}/bin/clang++ -mavx2 -std=c++11 -masm=intel -S -emit-llvm -O3 ${PROJECT_SOURCE_DIR}/cinn/runtime/cinn_runtime.cc -I${PROJECT_SOURCE_DIR} -o ${CMAKE_BINARY_DIR}/cinn/runtime/cinn_runtime.ll
+  COMMAND ${PYTHON_EXECUTABLE} generate_runtime_llvm_ir.py ${CMAKE_BINARY_DIR}/cinn/runtime/cinn_runtime.ll ${CMAKE_BINARY_DIR}/cinn/backends/llvm/cinn_runtime_llvm_ir.h ${LLVM_PATH}/bin/llvm-config
+  WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}/cinn/backends/llvm
+  DEPENDS ${PROJECT_SOURCE_DIR}/cinn/runtime/cinn_runtime.cc ${PROJECT_SOURCE_DIR}/cinn/runtime/cinn_runtime.h
+  )
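+# NOTE: the custom command above is a two-step pipeline: clang++ lowers
+# cinn_runtime.cc to textual LLVM IR (cinn_runtime.ll), then
+# generate_runtime_llvm_ir.py embeds that .ll file into the generated header
+# cinn_runtime_llvm_ir.h, which the GEN_LLVM_RUNTIME_IR_HEADER target below
+# tracks.
+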
+add_custom_target(GEN_LLVM_RUNTIME_IR_HEADER ALL + DEPENDS ${CMAKE_BINARY_DIR}/cinn/backends/llvm/cinn_runtime_llvm_ir.h + ) + +set(srcs + llvm_util.cc + runtime_symbol_registry.cc + codegen_llvm.cc + codegen_x86.cc + simple_jit.cc + execution_engine.cc + llvm_optimizer.cc +) + + +cc_test(test_codegen_llvm SRCS codegen_llvm_test.cc DEPS cinncore) +#cc_test(test_execution_engine SRCS execution_engine_test.cc DEPS cinncore) +cc_test(test_codegen_x86 SRCS codegen_x86_test.cc DEPS cinncore) + +foreach(cpp ${srcs}) + set(cinnapi_src + "${cinnapi_src};cinn/backends/llvm/${cpp}" + CACHE INTERNAL "") +endforeach() + +file(GLOB includes LIST_DIRECTORIES false RELATIVE ${CMAKE_SOURCE_DIR} *.h) + +foreach(header ${includes}) + set(core_includes "${core_includes};${header}" CACHE INTERNAL "") +endforeach() diff --git a/paddle/cinn/backends/llvm/codegen_llvm.cc b/paddle/cinn/backends/llvm/codegen_llvm.cc new file mode 100644 index 0000000000000..169fe3cfd40e3 --- /dev/null +++ b/paddle/cinn/backends/llvm/codegen_llvm.cc @@ -0,0 +1,1527 @@ +// Copyright (c) 2021 CINN Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "cinn/backends/llvm/codegen_llvm.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include "cinn/backends/extern_func_emitter.h" +#include "cinn/backends/extern_func_emitter_builtin.h" +#include "cinn/backends/llvm/llvm_util.h" +#include "cinn/common/cas.h" +#include "cinn/common/type.h" +#include "cinn/ir/ir_operators.h" +#include "cinn/ir/ir_printer.h" +#include "cinn/ir/ir_verify.h" +#include "cinn/optim/var_mod_simplify.h" +#include "cinn/runtime/cinn_runtime.h" +#include "cinn/runtime/intrinsic.h" +#include "cinn/utils/string.h" +#include "llvm/IR/BasicBlock.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/Verifier.h" +#include "llvm/Support/Alignment.h" + +namespace cinn { +namespace backends { + +using BinaryInstruction = llvm::Instruction::BinaryOps; +using common::bfloat16; +using common::float16; + +namespace { + +template +auto NodeToExpr(const T *node) { + std::ostringstream oss; + // oss << "\033[32m"; + oss << ir::Expr(const_cast(node)); + // oss << "\033[0m"; + return oss.str(); +} + +bool is_integral_type(common::Type t) { return t.is_int() || t.is_uint(); } + +bool is_floating_type(common::Type t) { return t.is_float(); } + +llvm::Value *EmitComparison(llvm::CmpInst::Predicate predicate, + llvm::Value *lhs, + llvm::Value *rhs, + llvm::IRBuilder<> *b) { + llvm::Value *comparison_result{nullptr}; + if (lhs->getType()->isIntegerTy()) { + comparison_result = b->CreateICmp(predicate, lhs, rhs); + } else { + comparison_result = b->CreateFCmp(predicate, lhs, rhs); + } + + return comparison_result; +} + +#define __IR_EMITTER_NOT_IMPLEMENTED(__op) CINN_NOT_IMPLEMENTED + +int NextPowerOfTwo(int x) { + for (int p2 = 1;; p2 *= 2) { + if (p2 >= x) { + return p2; + } + } + 
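+  // Unreachable: the loop above always returns the first power of two >= x
+  // (x <= 1 yields 1). The return below presumably just silences compilers
+  // that cannot prove the unbounded for-loop terminates.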
return 0; +} + +} // namespace + +CodeGenLLVM::CodeGenLLVM(llvm::Module *m, + llvm::IRBuilder<> *b, + const std::shared_ptr &symbol_table, + const Target &target) + : m_(m), b_(b), symbol_table_(symbol_table), target_(target) { + if (!symbol_table.get()) { + symbol_table_ = std::make_shared(); + } + symbol_table_->PushScope(); // Create a new scope by default. + + md_builder_ = std::make_unique(b_->getContext()); + md_tbaa_root_ = md_builder_->createTBAARoot("cinn-tbaa"); + md_tbaa_alias_set_ = md_builder_->createTBAANode("cinn-alias", md_tbaa_root_); + InitTarget(target_); +} + +CodeGenLLVM::~CodeGenLLVM() {} + +llvm::Value *CodeGenLLVM::EmitVectorSlice(llvm::Value *vec, int begin, int extent) { + int numel = llvm::dyn_cast(vec->getType())->getNumElements(); + if (extent == numel && begin == 0) return vec; + + CHECK(begin >= 0 && extent <= numel) << "Slicing out of bound!"; + + std::vector indices(extent); + for (int i = 0; i < extent; i++) { + llvm::Constant **v = &indices[i]; + if (begin + i >= 0 && begin + i < numel) { + *v = llvm::ConstantInt::get(b_->getInt32Ty(), begin + i); + } else { + *v = llvm::UndefValue::get(b_->getInt32Ty()); + } + } + return ShuffleVector(vec, vec, llvm::ConstantVector::get(std::move(indices))); +} + +llvm::Value *CodeGenLLVM::EmitVectorPad(llvm::Value *vec, int lanes) { +#if LLVM_VERSION_MAJOR <= 10 + llvm::Value *mask = llvm::UndefValue::get(llvm::VectorType::get(b_->getInt32Ty(), lanes)); +#else + llvm::Value *mask = + llvm::UndefValue::get(llvm::VectorType::get(b_->getInt32Ty(), llvm::ElementCount(lanes, false /*Scalable*/))); +#endif + int numel = llvm::dyn_cast(vec->getType())->getNumElements(); + + CHECK(numel <= lanes); + if (numel == lanes) return vec; + for (int i = 0; i < numel; i++) { + mask = + InsertElement(mask, llvm::ConstantInt::get(b_->getInt32Ty(), i), llvm::ConstantInt::get(b_->getInt32Ty(), i)); + } + + return ShuffleVector(vec, vec, mask); +} + +llvm::Value *CodeGenLLVM::EmitVectorConcat(std::vector vecs) { + int lanes = 0; + for (auto *v : vecs) { + lanes += llvm::dyn_cast(v->getType())->getNumElements(); + } + while (vecs.size() > 1) { + std::vector new_vecs; + for (size_t i = 0; i < vecs.size() - 1; i += 2) { + auto *lhs = vecs[i]; + auto *rhs = vecs[i + 1]; + const auto lhs_lanes = llvm::dyn_cast(lhs->getType())->getNumElements(); + const auto rhs_lanes = llvm::dyn_cast(rhs->getType())->getNumElements(); + if (lhs_lanes < rhs_lanes) { + lhs = EmitVectorPad(lhs, rhs_lanes); + } else if (lhs_lanes > rhs_lanes) { + rhs = EmitVectorPad(rhs, lhs_lanes); + } + + const auto shared_lanes = std::max(lhs_lanes, rhs_lanes); + std::vector mask(lhs_lanes + rhs_lanes); + std::iota(mask.begin(), std::next(mask.begin(), lhs_lanes), 0); + std::iota(std::next(mask.begin(), lhs_lanes), mask.end(), shared_lanes); + new_vecs.push_back(ShuffleVector(lhs, rhs, mask)); + } + if (vecs.size() % 2) { + new_vecs.push_back(vecs.back()); + } + + vecs = std::move(new_vecs); + } + + return EmitVectorSlice(vecs[0], 0, lanes); +} + +llvm::Value *CodeGenLLVM::EmitBinaryOp( + llvm::Value *lhs, llvm::Value *rhs, char opcode, bool is_integral, bool is_signed) { + llvm::Instruction::BinaryOps ops; + CHECK_EQ(lhs->getType(), rhs->getType()) + << "the types of operands of binary operation are mismatch" + << ", lhs[" << DumpToString(*lhs) << "] " << opcode << " rhs[" << DumpToString(*rhs) << "]" + << ", lhs_type[" << DumpToString(*lhs->getType()) << "], rhs_type[" << DumpToString(*rhs->getType()) << "]"; + switch (opcode) { + case '+': + ops = is_integral ? 
llvm::Instruction::BinaryOps::Add : llvm::Instruction::BinaryOps::FAdd; + break; + case '-': + ops = is_integral ? llvm::Instruction::BinaryOps::Sub : llvm::Instruction::BinaryOps::FSub; + break; + case '*': + ops = is_integral ? llvm::Instruction::BinaryOps::Mul : llvm::Instruction::BinaryOps::FMul; + break; + case '/': + ops = is_integral ? (is_signed ? llvm::Instruction::BinaryOps::SDiv : llvm::Instruction::BinaryOps::UDiv) + : llvm::Instruction::BinaryOps::FDiv; + break; + case '%': + ops = is_integral ? (is_signed ? llvm::Instruction::BinaryOps::SRem : llvm::Instruction::BinaryOps::URem) + : llvm::Instruction::BinaryOps::FRem; + break; + default: + return nullptr; + } + return BinOp(ops, lhs, rhs); +} + +llvm::Value *CodeGenLLVM::Visit(const ir::IntImm *op) { + auto *type = b_->getIntNTy(op->type().bits()); + return llvm::ConstantInt::get(type, op->value, true); +} + +llvm::Value *CodeGenLLVM::Visit(const ir::UIntImm *op) { + if (op->type().is_bool()) { + auto *type = b_->getInt1Ty(); + return llvm::ConstantInt::get(type, op->value, false); + } + auto *type = b_->getIntNTy(op->type().bits()); + return llvm::ConstantInt::get(type, op->value, false); +} + +llvm::Value *CodeGenLLVM::Visit(const ir::FloatImm *op) { + if (op->type().is_float(64)) { + return llvm::ConstantFP::get(b_->getDoubleTy(), op->value); + } else if (op->type().is_float(32)) { + return llvm::ConstantFP::get(b_->getFloatTy(), op->value); + } else if (op->type().is_bfloat16()) { + return llvm::ConstantFP::get(b_->getBFloatTy(), op->value); + } else if (op->type().is_float16()) { + return llvm::ConstantFP::get(b_->getHalfTy(), op->value); + } else { + LOG(FATAL) << "illegal float type."; + } + return nullptr; +} + +llvm::Value *CodeGenLLVM::LLVMGenGlobalStringVar(const std::string &data) { return b_->CreateGlobalStringPtr(data); } + +llvm::Value *CodeGenLLVM::Visit(const ir::StringImm *op) { return LLVMGenGlobalStringVar(op->value); } + +llvm::Value *CodeGenLLVM::Visit(const ir::Add *op) { + return EmitBinaryOp(Visit(&op->a()), Visit(&op->b()), '+', is_integral_type(op->type())); +} + +llvm::Value *CodeGenLLVM::Visit(const ir::Sub *op) { + return EmitBinaryOp(Visit(&op->a()), Visit(&op->b()), '-', is_integral_type(op->type())); +} + +llvm::Value *CodeGenLLVM::Visit(const ir::Mul *op) { + auto *lhs = Visit(&op->a()); + auto *rhs = Visit(&op->b()); + return EmitBinaryOp(lhs, rhs, '*', is_integral_type(op->type())); +} + +llvm::Value *CodeGenLLVM::Visit(const ir::Div *op) { + return EmitBinaryOp(Visit(&op->a()), Visit(&op->b()), '/', is_integral_type(op->type())); +} + +llvm::Value *CodeGenLLVM::Visit(const ir::Mod *op) { + return EmitBinaryOp(Visit(&op->a()), Visit(&op->b()), '%', is_integral_type(op->type())); +} + +#define __IR_EMITTER_DEFINE_CMP_VISITOR(__sop, __uop, __fop) \ + auto *lhs = Visit(&op->a()); \ + auto *rhs = Visit(&op->b()); \ + CHECK(op->a().type() == op->b().type()); \ + llvm::CmpInst::Predicate predicate; \ + if (op->a().type().is_int()) { \ + predicate = llvm::CmpInst::ICMP_##__sop; \ + } else if (op->a().type().is_uint()) { \ + predicate = llvm::CmpInst::ICMP_##__uop; \ + } else /*float*/ { \ + predicate = llvm::CmpInst::FCMP_##__fop; \ + } \ + return EmitComparison(predicate, lhs, rhs, b_) + +llvm::Value *CodeGenLLVM::Visit(const ir::EQ *op) { __IR_EMITTER_DEFINE_CMP_VISITOR(EQ, EQ, OEQ); } + +llvm::Value *CodeGenLLVM::Visit(const ir::NE *op) { __IR_EMITTER_DEFINE_CMP_VISITOR(NE, NE, ONE); } + +llvm::Value *CodeGenLLVM::Visit(const ir::LT *op) { __IR_EMITTER_DEFINE_CMP_VISITOR(SLT, ULT, OLT); } + 
+llvm::Value *CodeGenLLVM::Visit(const ir::LE *op) { __IR_EMITTER_DEFINE_CMP_VISITOR(SLE, ULE, OLE); } + +llvm::Value *CodeGenLLVM::Visit(const ir::GT *op) { __IR_EMITTER_DEFINE_CMP_VISITOR(SGT, UGT, OGT); } + +llvm::Value *CodeGenLLVM::Visit(const ir::GE *op) { __IR_EMITTER_DEFINE_CMP_VISITOR(SGE, UGE, OGE); } + +#undef __IR_EMITTER_DEFINE_CMP_VISITOR + +llvm::Value *CodeGenLLVM::Visit(const ir::And *op) { return And(Visit(&op->a()), Visit(&op->b())); } + +llvm::Value *CodeGenLLVM::Visit(const ir::Or *op) { return Or(Visit(&op->a()), Visit(&op->b())); } + +llvm::Value *CodeGenLLVM::Visit(const ir::Min *op) { + auto *lhs = Visit(&op->a()); + auto *rhs = Visit(&op->b()); + + llvm::Value *p{nullptr}; + if (op->type().is_int()) { + p = ICmpSLT(lhs, rhs); + } else if (op->type().is_uint()) { + p = ICmpULT(lhs, rhs); + } else /*float*/ { + p = FCmpOLT(lhs, rhs); + } + + return Select(p, lhs, rhs); +} + +llvm::Value *CodeGenLLVM::Visit(const ir::Max *op) { + auto *lhs = Visit(&op->a()); + auto *rhs = Visit(&op->b()); + + llvm::Value *p = nullptr; + if (op->type().is_int()) { + p = ICmpSGT(lhs, rhs); + } else if (op->type().is_uint()) { + p = ICmpUGT(lhs, rhs); + } else /*float*/ { + p = FCmpOGT(lhs, rhs); + } + + return Select(p, lhs, rhs); +} + +llvm::Value *CodeGenLLVM::Visit(const ir::Minus *op) { + auto *v = Visit(&op->v()); + return (op->type().is_int() || op->type().is_uint()) ? Neg(v) : FNeg(v); +} + +llvm::Value *CodeGenLLVM::Visit(const ir::Not *op) { return Not(Visit(&op->v())); } + +llvm::Value *CodeGenLLVM::Visit(const ir::Cast *op) { + auto from = op->v().type(); + auto to = op->type(); + + llvm::Type *source = CinnTypeToLLVMType(from, m_); + llvm::Type *target = CinnTypeToLLVMType(to, m_); + CHECK(source) << "source ir type is null"; + CHECK(target) << "target ir type is null"; + + llvm::Value *value = Visit(&op->v()); + CHECK(value) << "value is null"; + + // pod_value_t cast to a value. 
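+  // Rather than an LLVM cast instruction, this path lowers to a call into one
+  // of the runtime's cinn_pod_value_to_X helpers (selected below), since
+  // cinn_pod_value_t is a type-tagged value that only the runtime knows how
+  // to unpack.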
+ if (op->v().type().is_customized_type() && + op->v().type().customized_type() == common::customized_type::kpod_value_t) { // pod_value_t operator + llvm::Function *callee{}; + if (op->type().is_bool()) { + callee = m_->getFunction(runtime::intrinsic::pod_value_to_bool); + } else if (op->type().is_int(8)) { + callee = m_->getFunction(runtime::intrinsic::pod_value_to_int8); + } else if (op->type().is_int(16)) { + callee = m_->getFunction(runtime::intrinsic::pod_value_to_int16); + } else if (op->type().is_int(32)) { + callee = m_->getFunction(runtime::intrinsic::pod_value_to_int32); + } else if (op->type().is_int(64)) { + callee = m_->getFunction(runtime::intrinsic::pod_value_to_int64); + } else if (op->type().is_uint(8)) { + callee = m_->getFunction(runtime::intrinsic::pod_value_to_uint8); + } else if (op->type().is_uint(16)) { + callee = m_->getFunction(runtime::intrinsic::pod_value_to_uint16); + } else if (op->type().is_uint(32)) { + callee = m_->getFunction(runtime::intrinsic::pod_value_to_uint32); + } else if (op->type().is_uint(64)) { + callee = m_->getFunction(runtime::intrinsic::pod_value_to_uint64); + } else if (op->type().is_float(32)) { + callee = m_->getFunction(runtime::intrinsic::pod_value_to_float); + } else if (op->type().is_float(64)) { + callee = m_->getFunction(runtime::intrinsic::pod_value_to_double); + } else if (op->type().is_bfloat16()) { + callee = m_->getFunction(runtime::intrinsic::pod_value_to_bfloat16); + } else if (op->type().is_float16()) { + callee = m_->getFunction(runtime::intrinsic::pod_value_to_float16); + } else if (op->type() == type_of()) { + callee = m_->getFunction(runtime::intrinsic::pod_value_to_void_p); + } else if (op->type() == type_of() || op->type() == type_of()) { + callee = m_->getFunction(runtime::intrinsic::pod_value_to_buffer_p); + } else { + LOG(ERROR) << "can't cast cinn_pod_value_t to " << op->type(); + CINN_NOT_IMPLEMENTED + } + + CHECK(callee); + CHECK(op->v().as_var()) << "argument to the intrinsic function " + "cinn_pod_value_to_x should be a Var"; + value = GetVar(op->v().as_var()->name); + return Call(callee, std::vector({value}), "pod_value_cast"); + } + + do { + if (value->getType() == target) break; + + if (to.is_cpp_handle() || to.is_cpp_handle2()) { + value = BitCast(value, target, "cast_to_cpp_handle"); + break; + } + + if (to.is_bool()) { + if (from.is_float()) { + llvm::Constant *zero = llvm::ConstantFP::get(source, 0.); + value = FCmpONE(value, zero); + } else { + llvm::Constant *zero = llvm::ConstantInt::get(source, 0); + value = ICmpNE(value, zero); + } + break; + } + + if (from.is_float() == false && to.is_float() == false) { + value = IntCast(value, target, from.is_int()); + break; + } + + if (from.is_float() && to.is_int()) { + value = FPToSI(value, target); + break; + } + + if (from.is_float() && to.is_uint()) { + value = FPToUI(value, target); + if (to.bits() < 8) { + value = IntCast(value, target, false); + } + break; + } + + if (from.is_int() && to.is_float()) { + value = SIToFP(value, target); + break; + } + + if (from.is_uint() && to.is_float()) { + value = UIToFP(value, target); + break; + } + + CHECK(from.is_float() && to.is_float()); + value = FPCast(value, target); + } while (false); + + return value; +} + +llvm::Value *CodeGenLLVM::CreateSerialFor(const ir::For *op, int stride) { + SymbolTableGuard symbol_table_guard(*symbol_table_); + + do { + break; + llvm::BasicBlock *preheader_bb = b_->GetInsertBlock(); + auto *for_begin = llvm::BasicBlock::Create(b_->getContext(), "for_begin", 
b_->GetInsertBlock()->getParent()); + auto *for_body = llvm::BasicBlock::Create(b_->getContext(), "for_body", b_->GetInsertBlock()->getParent()); + auto *for_end = llvm::BasicBlock::Create(b_->getContext(), "for_end", b_->GetInsertBlock()->getParent()); + + Br(for_begin); + b_->SetInsertPoint(for_begin); + + auto *begin = Visit(&op->min); + auto *loop_value = PHI(begin->getType(), 2); + loop_value->addIncoming(begin, preheader_bb); + + llvm::Value *old_var = GetVar(op->loop_var->name); + SetVar(op->loop_var->name, loop_value); + auto *end = Visit(&op->extent); + CondBr(ICmpSLT(loop_value, end), for_body, for_end); + b_->SetInsertPoint(for_body); + Visit(&op->body); + + if (old_var) { + SetVar(op->loop_var->name, old_var); + } else { + symbol_table_->Erase(op->loop_var->name); + } + + auto loop_next = Add(loop_value, llvm::ConstantInt::get(b_->getInt32Ty(), stride), "indvar.inc", true, true); + loop_value->addIncoming(loop_next, b_->GetInsertBlock()); + + Br(for_begin); + b_->SetInsertPoint(for_end); + + return nullptr; + // llvm::AllocaInst *loop_var = Alloca(b_->getInt32Ty(), nullptr, op->loop_var->name); + // loop_var->setAlignment(llvm::Align(4)); + // SetVar(op->loop_var->name, loop_var); + } while (false); + + //////////////////////////////////// + llvm::BasicBlock *preheader_bb = b_->GetInsertBlock(); + llvm::BasicBlock *exit_bb = nullptr; + + llvm::BasicBlock::iterator insert_point = b_->GetInsertPoint(); + + if (insert_point == preheader_bb->end()) { + CHECK(!preheader_bb->getTerminator()); + exit_bb = llvm::BasicBlock::Create(b_->getContext(), "loop_exit", b_->GetInsertBlock()->getParent(), nullptr); + } else { + CHECK(preheader_bb->getTerminator()); + exit_bb = preheader_bb->splitBasicBlock(insert_point, "loop_exit"); + preheader_bb->getTerminator()->eraseFromParent(); + } + + llvm::BasicBlock *header_bb = + llvm::BasicBlock::Create(b_->getContext(), "loop_header", b_->GetInsertBlock()->getParent(), nullptr); + llvm::BasicBlock *body_bb = + llvm::BasicBlock::Create(b_->getContext(), "loop_body", b_->GetInsertBlock()->getParent(), nullptr); + + llvm::Function *func = preheader_bb->getParent(); + b_->SetInsertPoint(&func->getEntryBlock(), func->getEntryBlock().getFirstInsertionPt()); + + llvm::Value *old_var = GetVar(op->loop_var->name); + // loop iterator + llvm::AllocaInst *loop_var = Alloca(b_->getInt32Ty(), nullptr, op->loop_var->name); + loop_var->setAlignment(llvm::Align(4)); + SetVar(op->loop_var->name, loop_var); + + b_->SetInsertPoint(preheader_bb); + llvm::Value *start_index = Visit(&op->min); + llvm::Value *end_index = Visit(&op->extent); + Store(start_index, loop_var); + CHECK(!preheader_bb->getTerminator()); + Br(header_bb); + + // loop_header + b_->SetInsertPoint(header_bb); + llvm::Value *indvar = Load(loop_var, "indvar"); + llvm::Value *exit_cond = ICmpSGE(indvar, end_index); + CondBr(/*Cond=*/exit_cond, + /*True=*/exit_bb, + /*False=*/body_bb); + + // loop_body + b_->SetInsertPoint(body_bb); + llvm::Value *step = llvm::ConstantInt::get(b_->getInt32Ty(), stride); + + Visit(&op->body); + llvm::Value *indvar_inc = Add(indvar, + step, + "indvar.inc", + /*HasNUW=*/true, + /*HasNSW=*/true); + Store(indvar_inc, loop_var); + llvm::BranchInst *back_branch = Br(header_bb); + + // Add loop metadata + decltype(auto) ctx = b_->getContext(); + std::vector loop_metadata; + auto temp_node = llvm::MDNode::getTemporary(ctx, llvm::None); + loop_metadata.push_back(temp_node.get()); + + // TODO(fc500110): Loop vectorize + // auto *vectorization = op->metadata.vectorization ? 
b_->getTrue() : b_->getFalse(); + // loop_metadata.push_back(llvm::MDNode::get( + // ctx, {llvm::MDString::get(ctx, "llvm.loop.vectorize.enable"), + // llvm::ConstantAsMetadata::get(b_->getFalse())})); + + // Loop unroll + std::string llvm_unroll_metadata{"llvm.loop.unroll."}; + switch (op->metadata.unroll_mode) { + case ir::LLVMForLoopMeta::FullyUnroll: + llvm_unroll_metadata += "full"; + break; + case ir::LLVMForLoopMeta::NoUnroll: + llvm_unroll_metadata += "disable"; + break; + default: + llvm_unroll_metadata += "enable"; + } + + /* + loop_metadata.push_back(llvm::MDNode::get(ctx, {llvm::MDString::get(ctx, llvm_unroll_metadata)})); + auto loop_id = llvm::MDNode::get(ctx, loop_metadata); + loop_id->replaceOperandWith(0, loop_id); + back_branch->setMetadata(llvm::LLVMContext::MD_loop, loop_id); + */ + + if (old_var) { + SetVar(op->loop_var->name, old_var); + } else { + symbol_table_->Erase(op->loop_var->name); + } + + b_->SetInsertPoint(exit_bb); + return nullptr; +} + +llvm::Value *CodeGenLLVM::Visit(const ir::For *op) { return CreateSerialFor(op); } + +llvm::Value *CodeGenLLVM::Visit(const ir::PolyFor *op) { + CINN_NOT_IMPLEMENTED + return nullptr; +} + +llvm::Value *CodeGenLLVM::Visit(const ir::Select *op) { + return Select(Visit(&op->condition), Visit(&op->true_value), Visit(&op->false_value)); +} + +llvm::Value *CodeGenLLVM::Visit(const ir::IfThenElse *op) { + SymbolTableGuard symbol_table_guard(*symbol_table_); + + bool emit_else = op->false_case.defined(); + + auto &ll_ctx = b_->getContext(); + auto *ll_function = b_->GetInsertBlock()->getParent(); + + llvm::Value *cond = Visit(&op->condition); + llvm::BasicBlock *then_block = llvm::BasicBlock::Create(ll_ctx, "if-then", ll_function); + llvm::BasicBlock *end_block = llvm::BasicBlock::Create(ll_ctx, "if-end", ll_function); + + if (op->false_case.defined()) { + llvm::BasicBlock *else_block = llvm::BasicBlock::Create(ll_ctx, "if-else", ll_function); + CondBr(cond, then_block, else_block); + + // true case + b_->SetInsertPoint(then_block); + Visit(&op->true_case); + Br(end_block); + + // false case + b_->SetInsertPoint(else_block); + Visit(&op->false_case); + Br(end_block); + } else { + CondBr(cond, then_block, end_block); + b_->SetInsertPoint(then_block); + Visit(&op->true_case); + Br(end_block); + } + b_->SetInsertPoint(end_block); + + return nullptr; +} + +llvm::Value *CodeGenLLVM::Visit(const ir::Block *op) { + // Create a new scope holding the temporary variables. 
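+  // The guard pushes a scope now and pops it when this Visit returns, so
+  // names bound inside the block cannot leak into the enclosing scope.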
+ SymbolTableGuard symbol_table_guard(*symbol_table_); + + llvm::Value *ret = nullptr; + + llvm::BasicBlock *block = + llvm::BasicBlock::Create(b_->getContext(), "block", b_->GetInsertBlock()->getParent(), nullptr); + + Br(block); + b_->SetInsertPoint(block); + + for (const auto &expr : op->stmts) { + ret = Visit(&expr); + } + + return ret; +} + +llvm::Value *CodeGenLLVM::Visit(const ir::PrimitiveNode *) { CINN_NOT_IMPLEMENTED return nullptr; } +llvm::Value *CodeGenLLVM::Visit(const ir::_BufferRange_ *) { CINN_NOT_IMPLEMENTED return nullptr; } +llvm::Value *CodeGenLLVM::Visit(const ir::ScheduleBlock *) { CINN_NOT_IMPLEMENTED return nullptr; } +llvm::Value *CodeGenLLVM::Visit(const ir::ScheduleBlockRealize *) { CINN_NOT_IMPLEMENTED return nullptr; } + +llvm::Value *CodeGenLLVM::Visit(const ir::Call *op) { + if (op->name == runtime::intrinsic::debug_log_repr) { + return EmitCall_debug_info(op); + } else if (op->is_extern_call()) { + auto emitter_id = ExternFuncID{backend_llvm_host, op->name.c_str()}; + const auto &fn_name = ExternFunctionEmitterRegistry::Global().Lookup(emitter_id); + if (!fn_name.empty()) { + ExternFunctionLLVMEmitter emitter(fn_name); + emitter.BindCodeGen(this); + emitter.Emit(op); + return extern_func_emit_res_; + } + } + + llvm::Function *callee = m_->getFunction(op->name); + CHECK(callee) << "Unknown function referenced. [" << op->name << "]"; + + std::vector args; + for (const auto &e : op->read_args) { + auto *arg = Visit(&e); + CHECK(arg) << "argument " << e << " is null"; + args.push_back(arg); + } + for (const auto &e : op->write_args) { + auto *arg = Visit(&e); + CHECK(arg) << "argument " << e << " is null"; + args.push_back(arg); + } + + if (op->is_cinn_call()) { + auto arg = ir::intrinsics::GetAddr::Make(op->read_args[0]); + args[0] = Visit(&arg); + args[0] = BitCast(args[0], ll_void_p_ty(), "cast_to_void_p"); + } + + return Call(callee, std::move(args)); +} + +llvm::Value *CodeGenLLVM::Visit(const ir::_Module_ *op) { + { + Expr body_to_verify(&Reference(op)); + ir::IrVerify(body_to_verify); + } + + for (auto &fn : op->functions) { + VLOG(1) << "JIT Linking function [" << fn.As()->name << "]"; + ir::Expr fn_expr(fn); + + auto fnll = Visit(&fn_expr); + + VLOG(5) << "fn llvm:\n" << DumpToString(*fnll); + } +} + +llvm::Value *CodeGenLLVM::Visit(const ir::_Var_ *op) { + llvm::Value *value = GetVar(op->name, false); + llvm::Value *result{}; + CHECK(value) << "ir::_Var_[" << op->name << "]: value is null"; + // TODO(fc500110) hard coding + if (LLVM_WillVarLowerAsPointer(op->name)) { + result = value; + } else if (value->getType()->isPointerTy()) { + result = Load(value, op->name + "_load"); + } else { + result = value; + } + + return result; +} + +void CodeGenLLVM::Scalarize(const Expr &e, std::function flambda) { + if (const ir::Ramp *ramp = e.As()) { + for (int i = 0; i < ramp->type().lanes(); ++i) { + Expr offset = ramp->base + (ramp->stride * i); + VLOG(3) << "offset: " << offset; + flambda(i, Visit(&offset)); + } + } else { + llvm::Value *value = Visit(&e); + for (int i = 0; i < e->type().lanes(); ++i) { + flambda(i, b_->CreateExtractElement(value, i)); + } + } +} + +llvm::Value *CodeGenLLVM::Visit(const ir::Load *op) { + llvm::Value *array{nullptr}; + bool is_alias{false}; + if (auto *tensor_op = op->tensor.As()) { + array = GetVar(tensor_op->name); + } else if (auto *var_op = op->tensor.As()) { + array = GetVar(var_op->name); + is_alias = alias_vars_.count(const_cast(var_op)); + } else { + array = Visit(&op->tensor); + } + CHECK(array) << "fail to Visit Load 
node: " << Expr(const_cast(op)); + + ir::Expr index = op->index(); + if (index.type().lanes() <= 1) { + std::vector indices; + indices.push_back(Visit(&index)); + + // auto load_inst = Load(InBoundsGEP(array, std::move(indices))); + auto *load_inst = AlignedLoad(InBoundsGEP(array, std::move(indices)), llvm::MaybeAlign()); + /* + if (is_alias) { + llvm::MDNode *meta = md_builder_->createTBAANode("cinn-alias", md_tbaa_root_); + load_inst->setMetadata("tbaa", md_builder_->createTBAAStructTagNode(meta, meta, 0)); + } + */ + if (auto *load_tensor = op->tensor.as_tensor()) { + AddTbaaMetadata(load_inst, load_tensor->name, op->index()); + } + + { + int alignment = op->type().bits(); + alignment = 8; + CHECK_GT(alignment, 0); + load_inst->setAlignment(llvm::Align(std::min(alignment, 8))); + } + + // TODO(fc500110): tbaa AliasAnalysis + // auto md_tbaa_root = md_builder_->createTBAARoot("cinn-tbaa"); + // auto md_tbaa_alias_set = md_builder_->createTBAANode("cinn-alias", md_tbaa_root); + // llvm::MDNode *meta = md_tbaa_alias_set; + // load_inst->setMetadata("tbaa", md_builder_->createTBAAStructTagNode(meta, meta, 0)); + return load_inst; + } else { // vector load + Expr dense_strided_ramp = detail::StridedRampBase(op->index(), 1); + llvm::Value *buffer = Visit(&op->tensor); + if (dense_strided_ramp.defined()) { + CHECK(op->type().is_vector()); + return DenseVectorLoad(op); + } + // scalarize load + Type type = op->type(); + int alignment = type.bits() / 8; + llvm::Value *ret = llvm::UndefValue::get(CinnTypeToLLVMType(type, m_, true)); + auto flambda = [&](int i, llvm::Value *index) { + auto *ptr = CreateBufferPtr(type.ElementOf(), buffer, index); + llvm::LoadInst *load_inst = b_->CreateAlignedLoad(ptr, llvm::Align(alignment), "load_vec"); + ret = b_->CreateInsertElement(ret, load_inst, ll_const_int32(i)); + if (auto *load_tensor = op->tensor.as_tensor()) { + AddTbaaMetadata(load_inst, load_tensor->name, op->index()); + } + }; + Scalarize(op->index(), flambda); + return ret; + } +} + +llvm::Value *CodeGenLLVM::Visit(const ir::Store *op) { + llvm::Value *array{nullptr}; + bool is_alias{false}; + if (auto *tensor_op = op->tensor.As()) { + array = GetVar(tensor_op->name); + } else if (auto *var_op = op->tensor.As()) { + array = GetVar(var_op->name); + is_alias = alias_vars_.count(const_cast(var_op)); + } + CHECK(array) << "array is null"; + + ir::Expr index = op->index(); + + if (op->type().is_scalar()) { + std::vector indices; + indices.push_back(Visit(&index)); + + // auto *store_inst = Store(Visit(&op->value), InBoundsGEP(array, std::move(indices))); + auto *store_inst = AlignedStore(Visit(&op->value), InBoundsGEP(array, std::move(indices)), llvm::MaybeAlign()); + /* + if (is_alias) { + llvm::MDNode *meta = md_builder_->createTBAANode("cinn-alias", md_tbaa_root_); + store_inst->setMetadata("tbaa", md_builder_->createTBAAStructTagNode(meta, meta, 0)); + } + */ + { + int alignment = op->type().bits(); + alignment = 8; + CHECK_GT(alignment, 0); + store_inst->setAlignment(llvm::Align(std::min(alignment, 8))); + } + // TODO(fc500110): tbaa AliasAnalysis + // auto md_tbaa_root = md_builder_->createTBAARoot("cinn-tbaa"); + // auto md_tbaa_alias_set = md_builder_->createTBAANode("cinn-alias", md_tbaa_root); + // llvm::MDNode *meta = md_tbaa_alias_set; + // store_inst->setMetadata("tbaa", md_builder_->createTBAAStructTagNode(meta, meta, 0)); + AddTbaaMetadata(store_inst, op->tensor.as_tensor()->name, op->index()); + return store_inst; + } else { // vector store + Expr dense_strided_ramp = 
detail::StridedRampBase(op->index(), 1); + auto ramp_expr = op->index(); + auto *ramp = index.As(); + auto *buffer = Visit(&op->tensor); + auto *value = Visit(&op->value); + + if (dense_strided_ramp.defined()) { // stride 1 + int total_lanes = op->type().lanes(); + int step = naive_vec_alignment_ / op->type().ElementOf().bits(); + + // fit the total_lanes in native_lanes(split into multiple native steps) + for (int offset = 0; offset < total_lanes; offset += total_lanes) { + int lanes = total_lanes; + Expr base = common::AutoSimplify(ramp->base + offset); + optim::VarModSimplify(&base); + auto *ptr = CreateBufferPtr(op->type().ElementOf(), buffer, Visit(&base)); + auto *vtype = llvm::VectorType::get(CinnTypeToLLVMType(op->type().ElementOf(), m_, true), + llvm::ElementCount(lanes, false /*Scalable*/)) + ->getPointerTo(); + int alignment = std::max(op->type().ElementOf().bits() / 8, 1); + llvm::StoreInst *inst = + b_->CreateAlignedStore(CreateVecSlice(value, offset, lanes), b_->CreatePointerCast(ptr, vtype), alignment); + AddTbaaMetadata(inst, op->tensor.as_tensor()->name, base); + return inst; + } + } + // scalarize store + Type type = op->type(); + int alignment = type.bits() / 8; + llvm::Value *ret = llvm::UndefValue::get(CinnTypeToLLVMType(type, m_, true)); + auto flambda = [&](int i, llvm::Value *index) { + auto *ptr = CreateBufferPtr(type.ElementOf(), buffer, index); + llvm::StoreInst *store_inst = + b_->CreateAlignedStore(b_->CreateExtractElement(value, i), ptr, llvm::Align(alignment), "store_vec"); + ret = b_->CreateInsertElement(ret, store_inst, ll_const_int32(i)); + if (auto *store_tensor = op->tensor.as_tensor()) { + AddTbaaMetadata(store_inst, store_tensor->name, op->index()); + } + }; + Scalarize(op->index(), flambda); + return ret; + } + return nullptr; +} + +llvm::Value *CodeGenLLVM::Visit(const ir::Alloc *op) { + auto *buffer_op = op->destination.As(); + auto *buffer = GetVar(buffer_op->name); + CHECK(buffer); + + return buffer; +} + +llvm::Value *CodeGenLLVM::Visit(const ir::Free *op) { + auto *buffer_op = op->destination.As(); + CHECK(symbol_table_->Lookup(buffer_op->name)); + symbol_table_->Erase(buffer_op->name); + return nullptr; +} + +llvm::Value *CodeGenLLVM::Visit(const ir::_Buffer_ *op) { return GetVar(op->name); } + +llvm::Value *CodeGenLLVM::Visit(const ir::_Tensor_ *op) { + return GetVar(op->name); + auto *buffer_op = op->buffer.As(); + if (symbol_table_->Lookup(buffer_op->name)) { + return Visit(buffer_op); + } + + return SetVar(buffer_op->name, Visit(buffer_op)); +} + +template ::value, int> = 0> +void appendBody(std::vector &new_body, T &&v) { + new_body.push_back(v); +} + +template ::value, int> = 1> +void appendBody(std::vector &new_body, T &&v) { + new_body.insert(new_body.end(), v.begin(), v.end()); +} + +llvm::Value *CodeGenLLVM::Visit(const ir::_LoweredFunc_ *op) { + auto init_function_state = [this]() { alias_vars_.clear(); }; + init_function_state(); + + CHECK_EQ(op->alloc_output_buffer_exprs.size(), op->dealloc_output_buffer_exprs.size()) + << "the count of allocation and deallocation expressions is not match"; + + std::vector new_body; + auto create_temp_buffers = op->PrepareCreateTempBufferExprs(); + auto alloca_temp_buffers = op->PrepareAllocTempBufferExprs(); + auto dealloca_temp_buffers = op->PrepareDeallocTempBufferExprs(); + + appendBody(new_body, op->argument_prepare_exprs); + appendBody(new_body, create_temp_buffers); + appendBody(new_body, alloca_temp_buffers); + appendBody(new_body, op->alloc_output_buffer_exprs); + appendBody(new_body, 
op->buffer_data_cast_exprs); + appendBody(new_body, op->body); + appendBody(new_body, dealloca_temp_buffers); + appendBody(new_body, op->dealloc_output_buffer_exprs); + + ir::Expr function_body = ir::Block::Make(new_body); + + // Emit Function + std::vector arg_types = {b_->getInt8PtrTy(), b_->getInt32Ty()}; + + llvm::FunctionType *function_type = llvm::FunctionType::get( + /*Result=*/b_->getVoidTy(), + /*Params=*/std::move(arg_types), + /*isVarArg=*/false); + CHECK(m_->getFunction(op->name) == nullptr) << "function[" << op->name << "] exists"; + + f_ = llvm::Function::Create( + /*FunctionType=*/function_type, + /*LinkageTypes=*/llvm::Function::ExternalLinkage, + /*Name=*/op->name, + /*Module=*/m_); + f_->setCallingConv(llvm::CallingConv::C); + f_->setHasUWTable(); // GDB + + std::vector args; + args.reserve(f_->arg_size()); + std::transform( + f_->arg_begin(), f_->arg_end(), std::back_inserter(args), [](auto &arg) { return std::addressof(arg); }); + + llvm::BasicBlock *entry = llvm::BasicBlock::Create( + /*Context=*/b_->getContext(), + /*Name=*/"entry", + /*Parent=*/f_, + /*InsertBefore=*/nullptr); + + SetVar("_args", args[0]); + b_->SetInsertPoint(entry); + Visit(&function_body); + symbol_table_->Erase("_args"); + RetVoid(); + return f_; +} + +llvm::Value *CodeGenLLVM::Visit(const ir::Let *op) { + CHECK(op->type().valid()); + auto name = op->symbol.As()->name; + if (op->symbol.As()->type().is_cpp_handle()) { + alias_vars_.insert(const_cast(op->symbol.As())); + } + if (op->body.defined()) { + SetVar(name, Visit(&op->body)); + } else { + llvm::AllocaInst *inst = Alloca(CinnTypeToLLVMType(op->type(), m_), nullptr, name); + auto get_align = [](int n) { + int i{0}, r{1}; + while (n > r) { + r *= 2; + ++i; + } + return r / 8; + }; + int align_bits = std::max(op->type().bits(), 8); + int align = get_align(align_bits); + inst->setAlignment(llvm::Align(align)); + SetVar(name, inst); + } + + return GetVar(name); +} + +llvm::Value *CodeGenLLVM::Visit(const ir::Reduce *op) { __IR_EMITTER_NOT_IMPLEMENTED(op); } + +llvm::Value *CodeGenLLVM::Visit(const ir::Ramp *op) { __IR_EMITTER_NOT_IMPLEMENTED(op); } + +llvm::Value *CodeGenLLVM::Visit(const ir::Broadcast *op) { +#if LLVM_VERSION_MAJOR >= 11 + const llvm::ElementCount elem_count(op->lanes, /*scalable*/ false); +#else + const int elem_count = op->lanes; +#endif + llvm::Value *value = Visit(&op->value); + llvm::Constant *undef = llvm::UndefValue::get(llvm::VectorType::get(value->getType(), elem_count)); + llvm::Constant *zero = llvm::ConstantInt::get(ll_int32_ty(), 0); + value = b_->CreateInsertElement(undef, value, zero, "broadcast"); + llvm::Constant *zeros = llvm::ConstantVector::getSplat(elem_count, zero); + return b_->CreateShuffleVector(value, undef, zeros, "broadcast_shuffle"); +} + +llvm::Value *CodeGenLLVM::Visit(const ir::FracOp *op) { __IR_EMITTER_NOT_IMPLEMENTED(op); } + +llvm::Value *CodeGenLLVM::Visit(const ir::Product *op) { + auto size = op->operands().size(); + if (size == 0) return nullptr; + + llvm::Value *ret = Visit(&op->operand(0)); + for (int i = 1; i < size; i++) { + llvm::Value *v = Visit(&op->operand(i)); + if (is_integral_type(op->type())) { + ret = Mul(ret, v); + } else { + ret = FMul(ret, v); + } + } + + return ret; +} + +llvm::Value *CodeGenLLVM::Visit(const ir::Sum *op) { + auto size = op->operands().size(); + if (size == 0) return nullptr; + + llvm::Value *ret = Visit(&op->operand(0)); + for (int i = 1; i < size; i++) { + llvm::Value *v = Visit(&op->operand(i)); + if (is_integral_type(op->type())) { + ret = Add(ret, v); 
+ } else { // float + ret = FAdd(ret, v); + } + } + + return ret; +} + +#undef __IR_EMITTER_CINN_NOT_IMPLEMENTED + +void CodeGenLLVM::Compile(const ir::Module &module) { Visit(module.self()); } + +llvm::Value *CodeGenLLVM::EmitCall_buffer_malloc(const ir::Call *op) { return nullptr; } + +llvm::Value *CodeGenLLVM::EmitCall_get_address(const ir::Call *op) { + if (auto *read_var = op->read_args.front().as_var()) { + return GetVar(read_var->name); + } + + if (auto *read_buf = op->read_args.front().as_buffer()) { + return GetVar(read_buf->name); + } + return nullptr; +} + +llvm::Value *CodeGenLLVM::EmitCall_debug_info(const ir::Call *op) { + auto callee = m_->getFunction(runtime::intrinsic::debug_log_repr); + CHECK_GE(op->read_args.size(), 1UL); + std::vector args; + for (auto &arg : op->read_args) { + args.push_back(Visit(&arg)); + } + return Call(callee, args, "call debug_info"); +} + +llvm::Value *CodeGenLLVM::GetVar(const std::string &name, bool lazy) { + auto symbol = symbol_table_->Lookup(name); + if (!lazy) { + CHECK(symbol) << "No var [" << name << "] found"; + } + return symbol; +} + +llvm::Value *CodeGenLLVM::SetVar(const std::string &name, llvm::Value *val) { + symbol_table_->Insert(name, val); + CHECK(GetVar(name)); + return val; +} + +llvm::FunctionType *CodeGenLLVM::GenFunctionTypeFromCinnFunction(const ir::_LoweredFunc_ *func, bool with_buffer_type) { + auto func_ret_type = CinnTypeToLLVMType(Void(), m_); + std::vector arg_types; + for (auto &arg : func->args) { + if (arg.is_buffer() && arg.is_var()) { + alias_vars_.insert(arg.var_arg().get()); + } + if (arg.is_var()) { + arg_types.push_back(CinnTypeToLLVMType(arg.var_arg()->type(), m_)); + } else if (arg.is_buffer()) { + if (with_buffer_type) { + arg_types.push_back(ll_cinn_buffer_p_ty()); + } else { + arg_types.push_back(CinnTypeToLLVMType(arg.buffer_arg()->type(), m_)); + } + } + } + + return llvm::FunctionType::get(func_ret_type, arg_types, false); +} + +llvm::Value *CodeGenLLVM::DenseVectorLoad(const ir::Load *op) { + auto index = op->index(); + auto *ramp = index.As(); + CHECK(ramp); + + int load_lanes = op->type().lanes(); + int native_lanes = naive_vec_alignment_ / op->type().bits(); + + std::vector slices; + + llvm::Value *buffer = Visit(&op->tensor); + buffer->setName("buffer"); + + for (int i = 0; i < load_lanes; i += load_lanes) { + int slice_lanes = load_lanes; + auto slice_base = common::AutoSimplify(ramp->base + i); + optim::VarModSimplify(&slice_base); + auto slide_stride = Expr(1); + auto slide_index = slice_base; + +#if LLVM_VERSION_MAJOR >= 11 + const llvm::ElementCount elem_count(slice_lanes, /*scalable*/ false); +#else + const int elem_count = slice_lanes; +#endif + + llvm::Type *slice_type = llvm::VectorType::get(CinnTypeToLLVMType(op->type().ElementOf(), m_, true), elem_count); + + llvm::Value *elt_ptr = CreateBufferPtr(op->type().ElementOf(), buffer, Visit(&slice_base)); + llvm::Value *vec_ptr = b_->CreatePointerCast(elt_ptr, slice_type->getPointerTo(), "get_vec_ptr"); + + int alignment = std::max(op->type().ElementOf().bits() / 8, 1); + + llvm::Instruction *load_inst = b_->CreateAlignedLoad(vec_ptr, llvm::Align(alignment), "load_vec"); + AddTbaaMetadata(load_inst, op->tensor.as_tensor()->name, op->index()); + + slices.push_back(load_inst); + } + + CHECK_EQ(slices.size(), 1UL); + + return slices[0]; +} + +llvm::Value *CodeGenLLVM::CreateBufferVecPtr(Type t, llvm::Value *buffer, llvm::Value *index) { + CHECK_GT(t.lanes(), 1) << "type is not a vector type: " << t; + llvm::PointerType *btype = 
llvm::dyn_cast(buffer->getType()); + CHECK(btype); + llvm::PointerType *ptype = CinnTypeToLLVMType(t, m_)->getPointerTo(btype->getAddressSpace()); + if (btype != ptype) { + buffer = b_->CreatePointerCast(buffer, ptype); + } + return b_->CreateInBoundsGEP(buffer, index); +} + +llvm::Value *CodeGenLLVM::CreateBufferPtr(Type t, llvm::Value *buffer, llvm::Value *index) { + CHECK_EQ(t.lanes(), 1); + auto *btype = llvm::dyn_cast(buffer->getType()); + CHECK(btype); + auto *ptype = CinnTypeToLLVMType(t, m_)->getPointerTo(btype->getAddressSpace()); + CHECK(ptype); + if (btype != ptype) { + buffer = b_->CreatePointerCast(buffer, ptype, "pointer_cast"); + } + return b_->CreateInBoundsGEP(buffer, index, "buffer_ptr"); +} + +llvm::Value *CodeGenLLVM::CreateVecSlice(llvm::Value *vec, int begin, int lanes) { + int total_lanes = llvm::dyn_cast(vec->getType())->getNumElements(); + CHECK_LE(begin + lanes, total_lanes); + if (lanes == total_lanes && begin == 0) return vec; // full slice + std::vector indices; + for (int i = 0; i < lanes; ++i) { + indices.push_back(ll_const_int32(begin + i)); + } + llvm::Constant *undef = llvm::UndefValue::get(vec->getType()); + return b_->CreateShuffleVector(vec, undef, llvm::ConstantVector::get(indices)); +} + +void CodeGenLLVM::InitTarget(const Target &target) { + llvm::InitializeAllTargetInfos(); + llvm::InitializeAllTargets(); + llvm::InitializeAllTargetMCs(); + llvm::InitializeAllAsmParsers(); + llvm::InitializeAllAsmPrinters(); + switch (target.arch) { + case Target::Arch::X86: + if (target.bits == Target::Bit::k32) { + naive_vec_alignment_ = 256; + } else if (target.bits == Target::Bit::k64) { + naive_vec_alignment_ = 512; + } else { + LOG(FATAL) << "get unknown bits"; + } + break; + case Target::Arch::ARM: + naive_vec_alignment_ = 128; + break; + case Target::Arch::NVGPU: + naive_vec_alignment_ = 128; + break; + case Target::Arch::Unk: + LOG(FATAL) << "unknown Arch found"; + break; + } +} + +bool LLVM_WillVarLowerAsPointer(const std::string &var_name) { + return var_name == "_args" || utils::Endswith(var_name, "__ptr"); +} + +void CodeGenLLVM::AddTbaaMetadata(llvm::Instruction *inst, absl::string_view buffer, Expr index) { + // If the index is constant, generate some TBAA info that helps LLVM understand our loads/stores aren't aliased. + bool constant_index = false; + int base = 0; + int width = 1; + + if (index.defined()) { + if (const ir::Ramp *ramp = index.As()) { + auto *pstride_int = ramp->stride.As(); + auto *pbase_int = ramp->base.As(); + if (pstride_int && pbase_int) { + int stride = pstride_int->value; + base = pbase_int->value; + CHECK_GE(base, 0); + width = NextPowerOfTwo(ramp->lanes * stride); + + while (base % width) { + base -= base % width; + width *= 2; + } + constant_index = true; + } + } else { + auto *pbase_int = index.As(); + if (pbase_int) { + int pbase = pbase_int->value; + base = pbase; + constant_index = true; + } + } + } + + llvm::MDBuilder builder(b_->getContext()); + + // Add type-based-alias-analysis metadata to the pointer, so that loads and stores to different buffers can get + // reordered. + llvm::MDNode *tbaa = builder.createTBAARoot("cinn buffer"); + tbaa = builder.createTBAAScalarTypeNode(std::string(buffer), tbaa); + + // Add metadata for constant indices to allow loads and stores to the same buffer to get reordered. 
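+  // Each node generated below names an aligned window (width w, base b) that
+  // contains the access; two constant-index accesses are considered to alias
+  // only if their windows nest, so LLVM may freely reorder the rest.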
+ if (constant_index) { + for (int w = 1024; w >= width; w /= 2) { + int b = (base / w) * w; + tbaa = builder.createTBAAScalarTypeNode(utils::StringFormat("%s.width%d.base%d", buffer.data(), w, b), tbaa); + } + } + + tbaa = builder.createTBAAStructTagNode(tbaa, tbaa, 0); + inst->setMetadata("tbaa", tbaa); +} + +llvm::Value *CodeGenLLVM::Visit(const ir::IntrinsicOp *op) { + switch (op->getKind()) { +#define __(op__) \ + case ir::IntrinsicKind::k##op__: \ + return Visit(llvm::dyn_cast(op)); + INTRINSIC_KIND_FOR_EACH(__) +#undef __ + } +} + +llvm::Value *CodeGenLLVM::Visit(const ir::intrinsics::BufferGetDataHandle *op) { + std::vector args({Visit(&op->buffer)}); + auto *callee = m_->getFunction("cinn_buffer_get_data_handle"); + return Call(callee, std::move(args)); +} + +llvm::Value *CodeGenLLVM::Visit(const ir::intrinsics::BufferGetDataConstHandle *op) { + std::vector args({Visit(&op->buffer)}); + auto *callee = m_->getFunction("cinn_buffer_get_data_const_handle"); + return Call(callee, std::move(args)); +} + +llvm::Value *CodeGenLLVM::Visit(const ir::intrinsics::BufferCreate *op) { + auto *callee = m_->getFunction(runtime::intrinsic::buffer_create_default); + auto buffer_node = op->buffer.as_buffer(); + CHECK(buffer_node); + std::vector args({ll_const_int32(buffer_node->target.runtime_arch())}); + uint64_t memory_size = (buffer_node->dtype.ElementOf().bits() + 7) / 8; + for (auto shape : buffer_node->shape) { + int shape_int = shape.as_int32(); + memory_size *= shape_int; + } + args.push_back(ll_const_int64(memory_size)); + args.push_back(ll_const_int32(32)); + + return Call(callee, args); +} + +llvm::Value *CodeGenLLVM::Visit(const ir::intrinsics::GetAddr *op) { + if (auto *n = op->data.as_var()) { + return GetVar(n->name); + } else if (auto *n = op->data.as_buffer()) { + return GetVar(n->name); + } + if (auto *n = op->data.As()) { // get the address to an element in a buffer + auto *e = Visit(&op->data); + if (auto *e_load = llvm::dyn_cast(e)) { + return e_load->getPointerOperand(); + } + return e; + } + return nullptr; +} + +llvm::Value *CodeGenLLVM::Visit(const ir::intrinsics::ArgsConstruct *op) { + llvm::SmallVector args; + Expr var(op->var); + var->set_type(type_of()); + var = ir::intrinsics::GetAddr::Make(var); + + llvm::Value *ll_var = Visit(&var); + var = ir::Cast::Make(type_of(), var); + + Expr num_args(static_cast(op->args.size())); + args.push_back(BitCast(ll_var, ll_cinn_pod_p_ty(), "cast_to_pod_value_t_ptr")); + args.push_back(Visit(&num_args)); + for (auto &arg : op->args) { + args.push_back(Visit(&arg)); + } + + auto *callee = m_->getFunction(runtime::intrinsic::args_construct_repr); + return Call(callee, std::move(args)); +} + +llvm::Function *CodeGenLLVM::GetIntrinsicDecl(llvm::Intrinsic::ID id, + llvm::Type *ret_type, + llvm::ArrayRef arg_types) { + llvm::Module *module = m_; + + if (!llvm::Intrinsic::isOverloaded(id)) { + return llvm::Intrinsic::getDeclaration(module, id, {}); + } + + llvm::SmallVector infos; + llvm::Intrinsic::getIntrinsicInfoTableEntries(id, infos); + llvm::SmallVector overload_types; + + auto try_match = [&](llvm::FunctionType *f_ty, bool var_arg) { + overload_types.clear(); + llvm::ArrayRef ref(infos); + auto match = llvm::Intrinsic::matchIntrinsicSignature(f_ty, ref, overload_types); + if (match == llvm::Intrinsic::MatchIntrinsicTypes_Match) { + if (llvm::Intrinsic::matchIntrinsicVarArg(var_arg, ref)) { + return llvm::Intrinsic::MatchIntrinsicTypes_NoMatchArg; + } + } + return match; + }; + + auto *fn_ty = llvm::FunctionType::get(ret_type, 
arg_types, false); + switch (try_match(fn_ty, false)) { + case llvm::Intrinsic::MatchIntrinsicTypes_Match: + return llvm::Intrinsic::getDeclaration(module, id, overload_types); + case llvm::Intrinsic::MatchIntrinsicTypes_NoMatchRet: + return nullptr; + case llvm::Intrinsic::MatchIntrinsicTypes_NoMatchArg: + break; + } + + // try matching the var arg signature. + llvm::SmallVector var_types; + for (int i = 0; i <= arg_types.size(); ++i) { + if (i > 0) { + var_types.push_back(arg_types[i - 1]); + } + auto *ft = llvm::FunctionType::get(ret_type, var_types, true); + if (try_match(ft, true) == llvm::Intrinsic::MatchIntrinsicTypes_Match) { + return llvm::Intrinsic::getDeclaration(module, id, overload_types); + } + } + return nullptr; +} + +llvm::Value *CodeGenLLVM::Visit(const ir::intrinsics::BuiltinIntrin *op) { + std::string func_name = op->name; + if (op->id == -1) { + if (func_name == "bitwise_and") { + CHECK_GE(op->args.size(), 2U); + return b_->CreateAnd(Visit(&op->args[0]), Visit(&op->args[1])); + } else if (func_name == "bitwise_or") { + CHECK_GE(op->args.size(), 2U); + return b_->CreateOr(Visit(&op->args[0]), Visit(&op->args[1])); + } else if (func_name == "bitwise_xor") { + CHECK_GE(op->args.size(), 2U); + return b_->CreateXor(Visit(&op->args[0]), Visit(&op->args[1])); + } else if (func_name == "bitwise_not") { + CHECK_GE(op->args.size(), 1U); + return b_->CreateNot(Visit(&op->args[0])); + } else if (func_name == "left_shift") { + CHECK_GE(op->args.size(), 2U); + return b_->CreateShl(Visit(&op->args[0]), Visit(&op->args[1])); + } else if (func_name == "right_shift") { + CHECK_GE(op->args.size(), 2U); + if (op->args[0]->type().is_int()) { + return b_->CreateAShr(Visit(&op->args[0]), Visit(&op->args[1])); + } else { + return b_->CreateLShr(Visit(&op->args[0]), Visit(&op->args[1])); + } + } else if (func_name == "isnan") { + CHECK_GE(op->args.size(), 1U); + llvm::Value *v = Visit(&op->args[0]); + return b_->CreateFCmpUNO(v, v); + } + } + + llvm::Intrinsic::ID id = op->id; + int64_t num_signature = op->arg_nums; + std::vector arg_value; + std::vector arg_type; + for (size_t i = 0; i < op->args.size(); ++i) { + arg_value.push_back(Visit(&op->args[i])); + if (i < static_cast(num_signature)) { + arg_type.push_back(arg_value.back()->getType()); + } + } + CHECK(!op->args.empty()); + llvm::Type *return_type = CinnTypeToLLVMType(op->type(), m_, true); + llvm::Function *fn = GetIntrinsicDecl(id, return_type, arg_type); + CHECK(fn) << "Cannot find intrinsic declaration, possible type mismatch: " << llvm::Intrinsic::getName(id, {}); + return b_->CreateCall(fn, arg_value); +} + +llvm::Value *CodeGenLLVM::Visit(const ir::intrinsics::PodValueToX *op) { + auto to_type = op->GetOutputType(0); + llvm::Function *callee{}; + + if (to_type == type_of()) { + callee = m_->getFunction(runtime::intrinsic::pod_value_to_float); + } else if (to_type == type_of()) { + callee = m_->getFunction(runtime::intrinsic::pod_value_to_double); + } else if (to_type == type_of()) { + callee = m_->getFunction(runtime::intrinsic::pod_value_to_bfloat16); + } else if (to_type == type_of()) { + callee = m_->getFunction(runtime::intrinsic::pod_value_to_float16); + } else if (to_type == type_of()) { + callee = m_->getFunction(runtime::intrinsic::pod_value_to_bool); + } else if (to_type == type_of()) { + callee = m_->getFunction(runtime::intrinsic::pod_value_to_int8); + } else if (to_type == type_of()) { + callee = m_->getFunction(runtime::intrinsic::pod_value_to_int16); + } else if (to_type == type_of()) { + callee = 
m_->getFunction(runtime::intrinsic::pod_value_to_int32);
+  } else if (to_type == type_of<int64_t>()) {
+    callee = m_->getFunction(runtime::intrinsic::pod_value_to_int64);
+  } else if (to_type == type_of<uint8_t>()) {
+    callee = m_->getFunction(runtime::intrinsic::pod_value_to_uint8);
+  } else if (to_type == type_of<uint16_t>()) {
+    callee = m_->getFunction(runtime::intrinsic::pod_value_to_uint16);
+  } else if (to_type == type_of<uint32_t>()) {
+    callee = m_->getFunction(runtime::intrinsic::pod_value_to_uint32);
+  } else if (to_type == type_of<uint64_t>()) {
+    callee = m_->getFunction(runtime::intrinsic::pod_value_to_uint64);
+  } else if (to_type == type_of<void *>()) {
+    callee = m_->getFunction(runtime::intrinsic::pod_value_to_void_p);
+  } else if (to_type == type_of<cinn_buffer_t *>()) {
+    callee = m_->getFunction(runtime::intrinsic::pod_value_to_buffer_p);
+  } else {
+    LOG(FATAL) << "Not supported type: " << to_type;
+  }
+
+  CHECK(callee);
+  auto *value = Visit(&op->pod_value_ptr);
+  CHECK(value);
+  return Call(callee, std::vector<llvm::Value *>({value}), "pod_value_cast");
+}
+
+}  // namespace backends
+}  // namespace cinn
diff --git a/paddle/cinn/backends/llvm/codegen_llvm.h b/paddle/cinn/backends/llvm/codegen_llvm.h
new file mode 100644
index 0000000000000..f472e2239e15d
--- /dev/null
+++ b/paddle/cinn/backends/llvm/codegen_llvm.h
@@ -0,0 +1,248 @@
+// Copyright (c) 2021 CINN Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include "cinn/backends/llvm/ir_builder_mixin.h"
+#include "cinn/backends/llvm/llvm_util.h"
+#include "cinn/ir/intrinsic_ops.h"
+#include "cinn/ir/ir_visitor.h"
+#include "cinn/ir/lowered_func.h"
+#include "cinn/ir/module.h"
+
+namespace cinn {
+namespace backends {
+
+class LLVMIRVisitor : public ir::IRVisitorBase<llvm::Value *> {
+ public:
+  LLVMIRVisitor() = default;
+
+  using ir::IRVisitorBase<llvm::Value *>::Visit;
+#define __m(t__) virtual llvm::Value *Visit(const ir::t__ *x) = 0;
+  NODETY_FORALL(__m)
+#undef __m
+};
+
+/**
+ * Tell whether a variable called \p var_name will be lowered to a pointer type in LLVM.
+ * @param var_name name of the variable.
+ * @return a boolean.
+ */
+bool LLVM_WillVarLowerAsPointer(const std::string &var_name);
+
+class SymbolTable {
+ public:
+  SymbolTable() = default;
+
+  void PushScope() { scopes_.emplace_back(); }
+
+  llvm::Value *Lookup(const std::string &id) {
+    for (auto it = scopes_.rbegin(); it != scopes_.rend(); it++) {
+      auto vt = (*it).find(id);
+      if (vt != (*it).end()) return vt->second;
+    }
+    return nullptr;
+  }
+
+  void Insert(const std::string &id, llvm::Value *value) {
+    CHECK(!scopes_.empty());
+    scopes_.back().emplace(id, value);
+  }
+
+  void Erase(const std::string &id) {
+    CHECK(!scopes_.empty());
+    scopes_.back().erase(id);
+  }
+
+  void PopScope() {
+    CHECK(!scopes_.empty());
+    scopes_.pop_back();
+  }
+
+  //! Get the number of the variables contained in the current scope.
+  size_t size() const { return scopes_.empty() ? 0 : scopes_.back().size(); }
+
+  size_t num_scopes() const { return scopes_.size(); }
+
+ private:
+  std::vector<absl::flat_hash_map<std::string, llvm::Value *>> scopes_;
+
+  SymbolTable(const SymbolTable &) = delete;
+};
+
+struct SymbolTableGuard {
+  explicit SymbolTableGuard(SymbolTable &symbol_table) : symbol_table_(symbol_table) { symbol_table.PushScope(); }
+
+  ~SymbolTableGuard() { symbol_table_.PopScope(); }
+
+ private:
+  SymbolTable &symbol_table_;
+};
+
+/**
+ * Base class of all the LLVM-based codegen.
+ */
+class CodeGenLLVM : public LLVMIRVisitor, public IrBuilderMixin<CodeGenLLVM> {
+ public:
+  explicit CodeGenLLVM(llvm::Module *m,
+                       llvm::IRBuilder<> *b,
+                       const std::shared_ptr<SymbolTable> &symbol_table = nullptr,
+                       const Target &target = common::DefaultHostTarget());
+
+  // Common llvm types
+  // @{
+  inline llvm::Type *ll_void_p_ty() const { return llvm_type_of<void *>(m_); }
+  inline llvm::Type *ll_void_pp_ty() const { return llvm_type_of<void **>(m_); }
+
+  inline llvm::Type *ll_int8_ty() const { return llvm_type_of<int8_t>(m_); }
+  inline llvm::Type *ll_int16_ty() const { return llvm_type_of<int16_t>(m_); }
+  inline llvm::Type *ll_int32_ty() const { return llvm_type_of<int32_t>(m_); }
+  inline llvm::Type *ll_int64_ty() const { return llvm_type_of<int64_t>(m_); }
+
+  inline llvm::Type *ll_uint8_ty() const { return llvm_type_of<uint8_t>(m_); }
+  inline llvm::Type *ll_uint16_ty() const { return llvm_type_of<uint16_t>(m_); }
+  inline llvm::Type *ll_uint32_ty() const { return llvm_type_of<uint32_t>(m_); }
+  inline llvm::Type *ll_uint64_ty() const { return llvm_type_of<uint64_t>(m_); }
+
+  inline llvm::Type *ll_bf16_ty() const { return llvm_type_of<bfloat16>(m_); }
+  inline llvm::Type *ll_fp16_ty() const { return llvm_type_of<float16>(m_); }
+  inline llvm::Type *ll_fp32_ty() const { return llvm_type_of<float>(m_); }
+  inline llvm::Type *ll_fp64_ty() const { return llvm_type_of<double>(m_); }
+
+  inline llvm::Type *ll_cinn_buffer_p_ty() const { return llvm_type_of<cinn_buffer_t *>(m_); }
+  inline llvm::Type *ll_cinn_pod_ty() const { return llvm_type_of<cinn_pod_value_t>(m_); }
+  inline llvm::Type *ll_cinn_pod_p_ty() const { return llvm_type_of<cinn_pod_value_t *>(m_); }
+  // @}
+
+  //! get a llvm type equivalent to a CINN type.
+  inline llvm::Type *ll_type_of(Type type) { return CinnTypeToLLVMType(type, m_); }
+
+  // Common methods to get a constant
+  // @{
+  inline llvm::Constant *ll_const_int32(int v) const { return llvm::ConstantInt::get(b_->getInt32Ty(), v); }
+  inline llvm::Constant *ll_const_int64(int v) const { return llvm::ConstantInt::get(b_->getInt64Ty(), v); }
+  // @}
+
+  //! Get the bound LLVM module.
+  llvm::Module *m() { return m_; }
+  //! Get the bound LLVM ir builder.
+  llvm::IRBuilder<> *b() { return b_; }
+
+  void Compile(const ir::Module &module);
+
+  using LLVMIRVisitor::Visit;
+
+#define __(op__) llvm::Value *Visit(const ir::op__ *) override;
+  NODETY_FORALL(__)
+#undef __
+
+#define __(op__) llvm::Value *Visit(const ir::intrinsics::op__ *);
+  INTRINSIC_KIND_FOR_EACH(__)
+#undef __
+
+  //! Used for the ExternFuncEmitter to store temporary result.
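+  //  (Written by ExternFunctionLLVMEmitter while handling an extern ir::Call,
+  //  then read back as the result of CodeGenLLVM::Visit(const ir::Call *).)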
+ mutable llvm::Value *extern_func_emit_res_{}; + + std::shared_ptr named_vars() { return symbol_table_; } + + llvm::FunctionType *GenFunctionTypeFromCinnFunction(const ir::_LoweredFunc_ *func, bool with_buffer_type); + + virtual llvm::Value *GetVar(const std::string &name, bool lazy = true); + + llvm::Function *GetIntrinsicDecl(llvm::Intrinsic::ID id, + llvm::Type *ret_type, + llvm::ArrayRef arg_types); + + // Constants + // @{ + inline llvm::Value *llvm_int32_constant(int v) { return llvm::ConstantInt::get(ll_int32_ty(), v); } + // @} + + virtual ~CodeGenLLVM(); + + protected: + // TODO(Superjomn) When to clear the existing local variables when switch to another function? + llvm::Value *SetVar(const std::string &name, llvm::Value *val); + llvm::Value *EmitVectorSlice(llvm::Value *vec, int begin, int extent); + llvm::Value *EmitVectorPad(llvm::Value *vec, int lanes); + llvm::Value *EmitVectorConcat(std::vector vecs); + + //! Visit different kinds of Calls, the following methods are analogous to + //! those in CodeGenC. + // @{ + llvm::Value *EmitCall_buffer_create(const ir::Call *op); + llvm::Value *EmitCall_buffer_malloc(const ir::Call *op); + llvm::Value *EmitCall_get_address(const ir::Call *op); + llvm::Value *EmitCall_debug_info(const ir::Call *op); + // @} + + llvm::Value *EmitBinaryOp(llvm::Value *lhs, llvm::Value *rhs, char opcode, bool is_integral, bool is_signed = true); + + llvm::Value *LLVMGenGlobalStringVar(const std::string &data); + + llvm::Value *CreateBufferPtr(Type t, llvm::Value *buffer, llvm::Value *index); + llvm::Value *CreateBufferVecPtr(Type t, llvm::Value *buffer, llvm::Value *index); + llvm::Value *CreateVecSlice(llvm::Value *vec, int begin, int lanes); + + llvm::Value *DenseVectorLoad(const ir::Load *load); + llvm::Value *CreateSerialFor(const ir::For *op, int stride = 1); + + /** + * Mark a load or store with type-based-alias-analysis metadata so that LLVM can optimize by reordering loads and + * stores across different buffers. + */ + void AddTbaaMetadata(llvm::Instruction *inst, absl::string_view buffer, Expr index); + + void InitTarget(const Target &target); + + void Scalarize(const Expr &e, std::function flambda); + + llvm::Module *m_; + llvm::IRBuilder<> *b_; + // Current function + llvm::Function *f_; + + std::unique_ptr md_builder_; + + // std::shared_ptr> named_vars_; + std::shared_ptr symbol_table_; + std::unordered_set alias_vars_; + + llvm::MDNode *md_tbaa_root_{nullptr}; + llvm::MDNode *md_tbaa_alias_set_{nullptr}; + + int naive_vec_alignment_{0}; + Target target_; +}; +namespace detail { +Expr StridedRampBase(Expr e, int stride); +} // namespace detail + +} // namespace backends +} // namespace cinn diff --git a/paddle/cinn/backends/llvm/codegen_llvm_test.cc b/paddle/cinn/backends/llvm/codegen_llvm_test.cc new file mode 100644 index 0000000000000..ebeaf20f01577 --- /dev/null +++ b/paddle/cinn/backends/llvm/codegen_llvm_test.cc @@ -0,0 +1,623 @@ +// Copyright (c) 2021 CINN Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#include "cinn/backends/llvm/codegen_llvm.h" + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include "cinn/backends/llvm/cinn_runtime_llvm_ir.h" +#include "cinn/cinn.h" +#include "cinn/ir/ir.h" +#include "cinn/ir/module.h" +#include "cinn/lang/compute.h" +#include "cinn/lang/lower.h" +#include "cinn/lang/placeholder.h" + +namespace cinn { +namespace backends { + +namespace { + +auto CreateCodeGenLLVMTestLLVM() { + auto context = std::make_unique(); + auto b = std::make_unique>(*context); + auto m = std::make_unique("test_codegen_llvm", *context); + auto emitter = std::make_unique(m.get(), b.get()); + + return std::make_tuple(std::move(m), std::move(b), std::move(context), std::move(emitter)); +} + +auto CreateTensor() { + ir::Expr M(3); + ir::Expr N(2); + lang::Placeholder a("a", {M, N}); + lang::Placeholder b("b", {M, N}); + auto c = lang::Compute( + {M, N}, [&](auto i, auto j) { return a(i, j) + b(i, j); }, "c"); + + lang::Buffer c_buf(common::Float(32)); + + return std::make_tuple(std::move(a), std::move(b), std::move(c), std::move(c_buf)); +} + +auto CreateLLVMType(llvm::LLVMContext *context) { + llvm::Type *i8 = llvm::Type::getInt8Ty(*context); + llvm::Type *i32 = llvm::Type::getInt32Ty(*context); + llvm::Type *i64 = llvm::Type::getInt64Ty(*context); + llvm::Type *u32 = llvm::Type::getInt32Ty(*context); + llvm::Type *f32 = llvm::Type::getFloatTy(*context); + llvm::Type *f16 = llvm::Type::getHalfTy(*context); + llvm::Type *bf16 = llvm::Type::getBFloatTy(*context); + + return std::make_tuple(i8, i32, i64, u32, f32, f16, bf16); +} + +template +auto CreateBinaryOp(common::Type t, T1 x, T2 y) { + auto px = std::make_unique(t, x); + auto py = std::make_unique(t, y); + + auto ex = ir::Expr(px.release()); + auto ey = ir::Expr(py.release()); + + return std::make_unique(std::move(ex), std::move(ey)); +} + +auto CreateIrBuffer(common::Type t, std::string name, std::vector shape, int data_alignment = 0) { + CHECK_GE(data_alignment, 0); + auto buffer = ir::_Buffer_::Make(std::move(name), std::move(t)); + + if (data_alignment) { + buffer->data_alignment = data_alignment; + } + + for (auto i : shape) { + auto pi = std::make_unique(common::Int(32), i); + buffer->shape.emplace_back(pi.release()); + } + + return buffer; +} + +auto CreateIrTensor(std::string name, std::vector shape) { + std::vector shape_expr; + for (auto i : shape) { + auto pi = std::make_unique(common::Int(32), i); + shape_expr.emplace_back(pi.release()); + } + + ir::Tensor tensor(std::move(name), Float(32), shape_expr, shape_expr, {}, {}); + tensor->domain = tensor->shape; + return tensor; +} + +auto CreateLoweredFunc() { + // +} + +} // namespace + +using cinn::common::bfloat16; +using cinn::common::float16; + +TEST(CodeGenLLVM, Imm) { + auto context = std::make_unique(); + auto b = std::make_unique>(*context); + auto m = std::make_unique("test_codegen_llvm", *context); + auto emitter = std::make_unique(m.get(), b.get()); + + llvm::Type *i32 = llvm::Type::getInt32Ty(*context); + llvm::Type *u32 = llvm::Type::getInt32Ty(*context); + llvm::Type *f32 = llvm::Type::getFloatTy(*context); + llvm::Type *f16 = llvm::Type::getHalfTy(*context); + llvm::Type *bf16 = llvm::Type::getBFloatTy(*context); + + llvm::Value *value = nullptr; + + ir::IntImm i32_imm(common::Int(32), 10); + value = emitter->Visit(&i32_imm); + ASSERT_EQ(value->getType(), i32); + 
ASSERT_EQ(value, llvm::ConstantInt::get(i32, i32_imm.value, true)); + // value->print(llvm::outs(), false); + + ir::UIntImm u32_imm(common::UInt(32), 5); + value = emitter->Visit(&u32_imm); + ASSERT_EQ(value->getType(), u32); + ASSERT_EQ(value, llvm::ConstantInt::get(u32, u32_imm.value, false)); + + ir::FloatImm float32_imm(common::Float(32), 2.5); + value = emitter->Visit(&float32_imm); + ASSERT_EQ(value->getType(), f32); + ASSERT_EQ(value, llvm::ConstantFP::get(f32, float32_imm.value)); + + ir::FloatImm float16_imm(common::Float16(), 2.5); + value = emitter->Visit(&float16_imm); + ASSERT_EQ(value->getType(), f16); + ASSERT_EQ(value, llvm::ConstantFP::get(f16, float16_imm.value)); + + ir::FloatImm bfloat16_imm(common::BFloat16(), 2.5); + value = emitter->Visit(&bfloat16_imm); + ASSERT_EQ(value->getType(), bf16); + ASSERT_EQ(value, llvm::ConstantFP::get(bf16, bfloat16_imm.value)); +} + +TEST(CodeGenLLVM, Expr) { + auto context = std::make_unique(); + auto b = std::make_unique>(*context); + auto m = std::make_unique("test_binary_op", *context); + auto emitter = std::make_unique(m.get(), b.get()); + + llvm::Type *i1 = llvm::Type::getInt1Ty(*context); + llvm::Type *i8 = llvm::Type::getInt8Ty(*context); + llvm::Type *i32 = llvm::Type::getInt32Ty(*context); + llvm::Type *i64 = llvm::Type::getInt64Ty(*context); + llvm::Type *u32 = llvm::Type::getInt32Ty(*context); + llvm::Type *f32 = llvm::Type::getFloatTy(*context); + llvm::Type *f16 = llvm::Type::getHalfTy(*context); + llvm::Type *bf16 = llvm::Type::getBFloatTy(*context); + + llvm::Value *value = nullptr; + llvm::Value *expect_value = nullptr; + + std::string outs; + llvm::raw_string_ostream ss(outs); + + // + + do { + int x = 2; + int y = 3; + auto op = CreateBinaryOp(common::Int(32), x, y); + + expect_value = llvm::ConstantInt::get(i32, x + y); + value = emitter->Visit(op.get()); + ASSERT_EQ(value->getType(), i32); + ASSERT_EQ(value, expect_value); + // value->print(llvm::outs(), false); + // value->print(ss, false); + // LOG(INFO) << "xxx: " << ss.str(); + } while (false); + + // - + do { + float x = 2.5; + float y = 3.5; + auto op = CreateBinaryOp(common::Float(32), x, y); + + expect_value = llvm::ConstantFP::get(f32, x - y); + value = emitter->Visit(op.get()); + ASSERT_EQ(value->getType(), f32); + ASSERT_EQ(value, expect_value); + } while (false); + + // - + do { + float16 x{2.5}; + float16 y{3.5}; + auto op = CreateBinaryOp(common::Float16(), x, y); + + expect_value = llvm::ConstantFP::get(f16, x - y); + value = emitter->Visit(op.get()); + ASSERT_EQ(value->getType(), f16); + ASSERT_EQ(value, expect_value); + } while (false); + + // - + do { + bfloat16 x{2.5}; + bfloat16 y{3.5}; + auto op = CreateBinaryOp(common::BFloat16(), x, y); + + expect_value = llvm::ConstantFP::get(bf16, x - y); + value = emitter->Visit(op.get()); + ASSERT_EQ(value->getType(), bf16); + ASSERT_EQ(value, expect_value); + } while (false); + + // * + do { + int x = 5; + int y = 3; + auto op = CreateBinaryOp(common::Int(64), x, y); + expect_value = llvm::ConstantInt::get(i64, x * y); + value = emitter->Visit(op.get()); + ASSERT_EQ(value->getType(), i64); + ASSERT_EQ(value, expect_value); + } while (false); + + // / + do { + float x = 6; + float y = 4; + auto op = CreateBinaryOp(common::Float(32), x, y); + expect_value = llvm::ConstantFP::get(f32, x / y); + value = emitter->Visit(op.get()); + ASSERT_EQ(value->getType(), f32); + ASSERT_EQ(value, expect_value); + } while (false); + + // / + do { + float16 x{6}; + float16 y{4}; + auto op = CreateBinaryOp(common::Float16(), 
x, y); + expect_value = llvm::ConstantFP::get(f16, x / y); + value = emitter->Visit(op.get()); + ASSERT_EQ(value->getType(), f16); + ASSERT_EQ(value, expect_value); + } while (false); + + // / + do { + bfloat16 x{6}; + bfloat16 y{4}; + auto op = CreateBinaryOp(common::BFloat16(), x, y); + expect_value = llvm::ConstantFP::get(bf16, x / y); + value = emitter->Visit(op.get()); + ASSERT_EQ(value->getType(), bf16); + ASSERT_EQ(value, expect_value); + } while (false); + + // % + do { + int x = 25; + int y = 7; + auto op = CreateBinaryOp(common::Int(32), x, y); + expect_value = llvm::ConstantInt::get(i32, x % y); + value = emitter->Visit(op.get()); + ASSERT_EQ(value->getType(), i32); + ASSERT_EQ(value, expect_value); + } while (false); + + // == + do { + int x = 3; + int y = 3; + auto op = CreateBinaryOp(common::Int(32), x, y); + expect_value = llvm::ConstantInt::get(i1, 1); + value = emitter->Visit(op.get()); + ASSERT_EQ(value->getType(), i1); + ASSERT_EQ(value, expect_value); + } while (false); + + // != + do { + float x = 3; + float y = 3; + + auto op = CreateBinaryOp(common::Float(32), x, y); + expect_value = llvm::ConstantInt::get(i1, 0); + value = emitter->Visit(op.get()); + ASSERT_EQ(value->getType(), i1); + ASSERT_EQ(value, expect_value); + } while (false); + + // < + do { + int x = 6; + int y = 6; + auto op = CreateBinaryOp(common::Int(32), x, y); + value = emitter->Visit(op.get()); + expect_value = llvm::ConstantInt::get(i1, 0); + ASSERT_EQ(value->getType(), i1); + ASSERT_EQ(value, expect_value); + } while (false); + + // <= + do { + int x = 6; + int y = 6; + auto op = CreateBinaryOp(common::Int(32), x, y); + value = emitter->Visit(op.get()); + expect_value = llvm::ConstantInt::get(i1, 1); + ASSERT_EQ(value->getType(), i1); + ASSERT_EQ(value, expect_value); + } while (false); + + // > + do { + int x = 6; + int y = 6; + auto op = CreateBinaryOp(common::Int(32), x, y); + value = emitter->Visit(op.get()); + expect_value = llvm::ConstantInt::get(i1, 0); + ASSERT_EQ(value->getType(), i1); + ASSERT_EQ(value, expect_value); + } while (false); + + // >= + do { + int x = 6; + int y = 6; + auto op = CreateBinaryOp(common::Int(32), x, y); + value = emitter->Visit(op.get()); + expect_value = llvm::ConstantInt::get(i1, 1); + ASSERT_EQ(value->getType(), i1); + ASSERT_EQ(value, expect_value); + } while (false); + + // and, or + do { + } while (false); + + // min + do { + int x = 2; + int y = 3; + auto op = CreateBinaryOp(common::Int(32), x, y); + value = emitter->Visit(op.get()); + expect_value = llvm::ConstantInt::get(i32, std::min(x, y)); + ASSERT_EQ(value->getType(), i32); + ASSERT_EQ(value, expect_value); + } while (false); + + // max + do { + float x = 2; + float y = 3; + auto op = CreateBinaryOp(common::Float(32), x, y); + value = emitter->Visit(op.get()); + expect_value = llvm::ConstantFP::get(f32, std::max(x, y)); + ASSERT_EQ(value->getType(), f32); + ASSERT_EQ(value, expect_value); + } while (false); + + // minus + // not + + // cast + do { + // i32 -> u32 + // skip + + // i32 -> f32 + LOG(INFO) << "test i32 -> f32"; + int v2 = 2; + auto x2 = std::make_unique(common::Int(32), v2); + auto ex2 = ir::Expr(x2.release()); + auto op2 = ir::Cast::Make(common::Float(32), std::move(ex2)); + value = emitter->Visit(&op2); + expect_value = llvm::ConstantFP::get(f32, v2); + ASSERT_EQ(value->getType(), f32); + ASSERT_EQ(value, expect_value); + + // f32 -> i32 + LOG(INFO) << "test f32 -> i32"; + float v3 = 3; + auto x3 = std::make_unique(common::Float(32), v3); + auto ex3 = ir::Expr(x3.release()); + auto 
op3 = ir::Cast::Make(common::Int(32), std::move(ex3)); + value = emitter->Visit(&op3); + expect_value = llvm::ConstantInt::get(i32, v3); + ASSERT_EQ(value->getType(), i32); + ASSERT_EQ(value, expect_value); + + // i32 -> f16 + LOG(INFO) << "test i32 -> f16"; + int v4 = 4; + auto x4 = std::make_unique(common::Int(32), v4); + auto ex4 = ir::Expr(x4.release()); + auto op4 = ir::Cast::Make(common::Float16(), std::move(ex4)); + value = emitter->Visit(&op4); + expect_value = llvm::ConstantFP::get(f16, v4); + ASSERT_EQ(value->getType(), f16); + ASSERT_EQ(value, expect_value); + + // f16 -> f32 + LOG(INFO) << "test f16 -> f32"; + float16 v5{5}; + auto x5 = std::make_unique(common::Float16(), v5); + auto ex5 = ir::Expr(x5.release()); + auto op5 = ir::Cast::Make(common::Float(32), std::move(ex5)); + value = emitter->Visit(&op5); + expect_value = llvm::ConstantFP::get(f32, v5); + ASSERT_EQ(value->getType(), f32); + ASSERT_EQ(value, expect_value); + + // i32 -> bf16 + LOG(INFO) << "test i32 -> bf16"; + int v6 = 4; + auto x6 = std::make_unique(common::Int(32), v6); + auto ex6 = ir::Expr(x6.release()); + auto op6 = ir::Cast::Make(common::BFloat16(), std::move(ex6)); + value = emitter->Visit(&op6); + expect_value = llvm::ConstantFP::get(bf16, v6); + ASSERT_EQ(value->getType(), bf16); + ASSERT_EQ(value, expect_value); + + // bf16 -> f32 + LOG(INFO) << "test bf16 -> f32"; + bfloat16 v7{5}; + auto x7 = std::make_unique(common::BFloat16(), v7); + auto ex7 = ir::Expr(x7.release()); + auto op7 = ir::Cast::Make(common::Float(32), std::move(ex7)); + value = emitter->Visit(&op7); + expect_value = llvm::ConstantFP::get(f32, v7); + ASSERT_EQ(value->getType(), f32); + ASSERT_EQ(value, expect_value); + } while (false); +} + +TEST(CodeGenLLVM, Statement) { + return; + std::string outs; + llvm::raw_string_ostream ss(outs); + + do { + auto _m_b_context_emitter_ = CreateCodeGenLLVMTestLLVM(); // NOLINT + auto &m = std::get<0>(_m_b_context_emitter_); + auto &b = std::get<1>(_m_b_context_emitter_); + auto &context = std::get<2>(_m_b_context_emitter_); + auto &emitter = std::get<3>(_m_b_context_emitter_); + auto _i8_i32_i64_u32_f32_f16_ = CreateLLVMType(context.get()); // NOLINT + auto &i8 = std::get<0>(_i8_i32_i64_u32_f32_f16_); + auto &i32 = std::get<1>(_i8_i32_i64_u32_f32_f16_); + auto &i64 = std::get<2>(_i8_i32_i64_u32_f32_f16_); + auto &u32 = std::get<3>(_i8_i32_i64_u32_f32_f16_); + auto &f32 = std::get<4>(_i8_i32_i64_u32_f32_f16_); + auto &f16 = std::get<4>(_i8_i32_i64_u32_f32_f16_); + llvm::FunctionType *function_type = llvm::FunctionType::get(i32, {}, false); + llvm::Function *function = llvm::Function::Create( + function_type, llvm::Function::ExternalLinkage, "codegen_llvm_test.Alloc_Store_Load_Free", m.get()); + + std::string module_str; + module_str += "; ModuleID = 'test_codegen_llvm'"; + module_str += "\nsource_filename = \"test_codegen_llvm\"\n"; + module_str += "\ndefine i32 @codegen_llvm_test.Alloc_Store_Load_Free()"; + + llvm::BasicBlock *entry = llvm::BasicBlock::Create(*context, "entry", function); + b->SetInsertPoint(entry); + + module_str += " {\nentry:"; + + // ir::Tensor + auto tensor_op = CreateIrTensor("x", {2, 3}); + tensor_op->buffer = CreateIrBuffer(common::Int(32), "", {2, 3}); + + // ir::Alloc + auto alloc_op = std::make_unique(); + alloc_op->destination = ir::Expr(tensor_op->buffer); + + // ir::Store + auto store_op = std::make_unique(); + store_op->tensor = ir::Expr(tensor_op); + for (int i : {1, 1}) { + auto pi = std::make_unique(common::Int(32), std::move(i)); + 
store_op->indices.emplace_back(pi.release()); + } + auto store_value = std::make_unique(common::Int(32), 5); + store_op->value = ir::Expr(store_value.release()); + + // ir::Load + auto load_op = std::make_unique(); + load_op->tensor = ir::Expr(tensor_op); + for (int i : {1, 1}) { + auto pi = std::make_unique(common::Int(32), std::move(i)); + load_op->indices.emplace_back(pi.release()); + } + + // ir::Free + auto free_op = std::make_unique(); + free_op->destination = ir::Expr(tensor_op->buffer); + + // ir::Call + auto call_op = std::make_unique(common::Int(32)); + call_op->name = "codegen_llvm_test.Alloc_Store_Load_Free"; + + // Emit llvm ir + auto *alloc_inst = llvm::dyn_cast(emitter->Visit(alloc_op.get())); + module_str += "\n %0 = alloca [6 x i32]"; + auto *store_inst = llvm::dyn_cast(emitter->Visit(store_op.get())); + module_str += "\n %1 = getelementptr [6 x i32], [6 x i32]* %0, i32 1"; + module_str += "\n store i32 5, [6 x i32]* %1"; + auto *load_inst = llvm::dyn_cast(emitter->Visit(load_op.get())); + module_str += "\n %2 = getelementptr [6 x i32], [6 x i32]* %0, i32 1"; + module_str += "\n %3 = load [6 x i32], [6 x i32]* %2"; + + b->CreateRet(llvm::ConstantInt::get(i32, 1)); + + module_str += "\n ret i32 1"; + module_str += "\n}\n"; + + auto log_inst = [&ss, &outs](auto *inst) { + inst->print(ss, false); + LOG(INFO) << inst->getOpcodeName() << " instruction:" << ss.str(); + outs.clear(); + }; + + log_inst(alloc_inst); + log_inst(store_inst); + log_inst(load_inst); + + ASSERT_EQ(module_str, ss.str()); + } while (false); +} + +TEST(CodeGenLLVM, LowerFunc) { + std::string outs; + llvm::raw_string_ostream ss(outs); + + do { + auto context = std::make_unique(); + // auto src_name = m->getSourceFileName(); + llvm::SMDiagnostic error; + std::string runtime_ir(backends::kRuntimeLlvmIr); + // NOTE: read ir string before IRBuilder create + auto m = llvm::parseAssemblyString(runtime_ir, error, *context); + error.print("error:", ss, false); + CHECK(m) << ss.str(); + auto b = std::make_unique>(*context); + + auto emitter = std::make_unique(m.get(), b.get()); + + auto _i8_i32_i64_u32_f32_f16_ = CreateLLVMType(context.get()); // NOLINT + auto &i8 = std::get<0>(_i8_i32_i64_u32_f32_f16_); + auto &i32 = std::get<1>(_i8_i32_i64_u32_f32_f16_); + auto &i64 = std::get<2>(_i8_i32_i64_u32_f32_f16_); + auto &u32 = std::get<3>(_i8_i32_i64_u32_f32_f16_); + auto &f32 = std::get<4>(_i8_i32_i64_u32_f32_f16_); + auto &f16 = std::get<5>(_i8_i32_i64_u32_f32_f16_); + auto _x_y_z_z_buf_ = CreateTensor(); // NOLINT + auto &x = std::get<0>(_x_y_z_z_buf_); + auto &y = std::get<1>(_x_y_z_z_buf_); + auto &z = std::get<2>(_x_y_z_z_buf_); + auto &z_buf = std::get<3>(_x_y_z_z_buf_); + + z->Bind(z_buf); + + auto stages = CreateStages({x, y, z}); + auto function = lang::Lower("add1", stages, {x, y, z}); + ir::Expr func_expr(function); + + auto ir_function = emitter->Visit(&func_expr); + LOG(INFO) << "ir function: " << func_expr; + + auto func = m->getFunction("add1"); + } while (false); +} + +TEST(SymbolTable, test) { + SymbolTable table; + ASSERT_EQ(table.num_scopes(), 0UL); + + table.PushScope(); + + auto *fake_addr = reinterpret_cast(1); + table.Insert("a", fake_addr); + ASSERT_EQ(table.size(), 1UL); + + table.PushScope(); + table.Insert("b", fake_addr); + ASSERT_EQ(table.size(), 1UL); + + auto *a = table.Lookup("a"); + ASSERT_EQ(a, fake_addr); + + auto *b = table.Lookup("b"); + ASSERT_EQ(b, fake_addr); +} + +} // namespace backends +} // namespace cinn diff --git a/paddle/cinn/backends/llvm/codegen_x86.cc 
b/paddle/cinn/backends/llvm/codegen_x86.cc new file mode 100644 index 0000000000000..c76b04b16c372 --- /dev/null +++ b/paddle/cinn/backends/llvm/codegen_x86.cc @@ -0,0 +1,163 @@ +// Copyright (c) 2021 CINN Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "cinn/backends/llvm/codegen_x86.h" + +#include +#include + +#include +#include + +#include "cinn/backends/llvm/codegen_llvm.h" +#include "cinn/common/target.h" +#include "cinn/ir/ir.h" +#include "cinn/ir/ir_operators.h" +#include "cinn/optim/collect_undefined_vars.h" +#include "cinn/runtime/intrinsic.h" +#include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/Intrinsics.h" +#include "llvm/Support/Casting.h" + +namespace cinn::backends { + +CodeGenX86::CodeGenX86(llvm::Module* m, llvm::IRBuilder<>* b, const std::shared_ptr& vars) + : CodeGenLLVM(m, b, vars) {} + +CodeGenX86::~CodeGenX86() {} + +llvm::Value* CodeGenX86::PackVars(const std::vector& vars, uint64_t* num_bytes) { + if (vars.empty()) { + *num_bytes = 0U; + return llvm::Constant::getNullValue(ll_void_p_ty()); + } + std::vector types; + for (auto& v : vars) { + types.push_back(GetVar(v, false)->getType()); + } + llvm::StructType* t_data = llvm::StructType::create(types); + llvm::Value* data = b_->CreateAlloca(t_data, llvm_int32_constant(1)); + for (size_t i = 0; i < vars.size(); ++i) { + b_->CreateStore(GetVar(vars[i]), b_->CreateInBoundsGEP(data, {llvm_int32_constant(0), llvm_int32_constant(i)})); + } + *num_bytes = m_->getDataLayout().getTypeAllocSize(llvm::cast(data->getType())->getElementType()); + return data; +} + +void CodeGenX86::UnpackVars(const std::vector& vars, llvm::Value* data) { + for (size_t i = 0; i < vars.size(); ++i) { + SetVar(vars[i], b_->CreateLoad(b_->CreateInBoundsGEP(data, {llvm_int32_constant(0), llvm_int32_constant(i)}))); + } +} + +llvm::BasicBlock* CodeGenX86::CheckCallSuccess(llvm::Value* retcode) { + llvm::BasicBlock* fail_block = + llvm::BasicBlock::Create(b_->getContext(), "call_fail", b_->GetInsertBlock()->getParent(), nullptr); + llvm::BasicBlock* end_block = + llvm::BasicBlock::Create(b_->getContext(), "call_end", b_->GetInsertBlock()->getParent(), nullptr); + llvm::Value* succ = b_->CreateICmpEQ(retcode, llvm::ConstantInt::get(ll_int32_ty(), 0)); + b_->CreateCondBr(succ, end_block, fail_block); + b_->SetInsertPoint(fail_block); + RetVoid(); + b_->SetInsertPoint(end_block); + return end_block; +} + +void CodeGenX86::CreateParallelLaunch(Expr body, int num_task) { + auto ftype_parallel_lambda = + llvm::FunctionType::get(ll_int32_ty(), {ll_int32_ty(), ll_int32_ty(), ll_type_of(Float(32).PointerOf())}, false); + llvm::Function* f = + llvm::Function::Create(ftype_parallel_lambda, llvm::Function::PrivateLinkage, "__parallel_lambda", m_); + std::vector vars = optim::CollectUndefinedVars(&body); + uint64_t nbytes; + auto* data = PackVars(vars, &nbytes); + + auto ftype_parallel_launch = llvm::FunctionType::get( + ll_int32_ty(), {ftype_parallel_lambda->getPointerTo(), 
ll_type_of(Float(32).PointerOf()), ll_int32_ty()}, false); + auto* launch_callee = llvm::dyn_cast( + m_->getOrInsertFunction(runtime::intrinsic::parallel_launch, ftype_parallel_launch).getCallee()); + launch_callee->setCallingConv(llvm::CallingConv::C); + auto* launch_end = CheckCallSuccess(b_->CreateCall( + launch_callee, + {f, b_->CreatePointerCast(data, ll_type_of(Float(32).PointerOf())), llvm_int32_constant(num_task)})); + + auto* flambda = llvm::BasicBlock::Create(b_->getContext(), "flambda", f); + b_->SetInsertPoint(flambda); + auto it = f->arg_begin(); + auto* task_id = &(*it++); + auto* penv = &(*it++); + data = b_->CreatePointerCast(&(*it++), data->getType()); + symbol_table_->PushScope(); + UnpackVars(vars, data); + ParallelEnv par_env; + auto task_id_name = common::UniqName("task_id"); + auto num_task_name = common::UniqName("num_task"); + par_env.task_id = ir::Var(task_id_name, Int(32)); + par_env.num_task = ir::Var(num_task_name, Int(32)); + SetVar(task_id_name, task_id); + SetVar(num_task_name, penv); + par_env.penv = penv; + std::swap(f_, f); + std::swap(parallel_env_, par_env); + this->Visit(&body); + b_->CreateRet(ll_const_int32(0)); + symbol_table_->Erase(task_id_name); + symbol_table_->Erase(num_task_name); + symbol_table_->PopScope(); + std::swap(parallel_env_, par_env); + std::swap(f_, f); + CHECK_NE(par_env.parallel_loop_count, 0) << "find no parallel loop within parallel launch"; + b_->SetInsertPoint(launch_end); +} + +llvm::Value* CodeGenX86::Visit(const ir::For* op) { + if (op->is_parallel()) { + VLOG(3) << "parallel forloop"; + if (parallel_env_.penv == nullptr) { + CreateParallelLaunch( + ir::For::Make( + op->loop_var, op->min, op->extent, op->for_type(), op->device_api, op->body, op->vectorize_info()), + 0); + } else { + Expr num_task = parallel_env_.num_task; + Expr task_id = parallel_env_.task_id; + CHECK(!parallel_env_.in_parallel_loop) << "Nested parallel loop is not supported, try to fuse them instead"; + parallel_env_.in_parallel_loop = true; + if (parallel_env_.stride_pattern) { + auto new_for = ir::For::Make( + op->loop_var, task_id, op->extent, op->for_type(), op->device_api, op->body, op->vectorize_info()); + auto for_node = new_for.As(); + CHECK(for_node); + CreateSerialFor(for_node, num_task.as_int32()); + } else { + Expr extent = op->extent; + Expr step = (extent + num_task - Expr(1)) / num_task; + Expr begin = min(task_id * step, op->extent); + Expr end = min((task_id + Expr(1)) * step, op->extent); + auto new_for = + ir::For::Make(op->loop_var, begin, end, op->for_type(), op->device_api, op->body, op->vectorize_info()); + auto for_node = new_for.As(); + CHECK(for_node); + CreateSerialFor(for_node); + } + parallel_env_.in_parallel_loop = false; + ++parallel_env_.parallel_loop_count; + } + } else { + return CodeGenLLVM::Visit(op); + } + return nullptr; +} +} // namespace cinn::backends diff --git a/paddle/cinn/backends/llvm/codegen_x86.h b/paddle/cinn/backends/llvm/codegen_x86.h new file mode 100644 index 0000000000000..baf480f51a3d5 --- /dev/null +++ b/paddle/cinn/backends/llvm/codegen_x86.h @@ -0,0 +1,59 @@ +// Copyright (c) 2021 CINN Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include + +#include +#include +#include + +#include "cinn/backends/llvm/codegen_llvm.h" + +namespace cinn::backends { + +class CodeGenX86 : public CodeGenLLVM { + public: + explicit CodeGenX86(llvm::Module* m, llvm::IRBuilder<>* b, const std::shared_ptr& vars = nullptr); + virtual ~CodeGenX86(); + + using LLVMIRVisitor::Visit; + + llvm::Value* Visit(const ir::For* op); + + private: + // parallel information + struct ParallelEnv { + Expr task_id; + Expr num_task; + bool stride_pattern{false}; + bool in_parallel_loop{false}; + int parallel_loop_count{0}; + llvm::Value* penv{nullptr}; + }; + + llvm::Value* ParallelLaunch(); + // Create parallel launch + void CreateParallelLaunch(Expr body, int num_task); + + llvm::Value* PackVars(const std::vector& vars, uint64_t* num_bytes); + void UnpackVars(const std::vector& vars, llvm::Value* data); + llvm::BasicBlock* CheckCallSuccess(llvm::Value* retcode); + // Current parallel environment scope. + ParallelEnv parallel_env_; +}; + +} // namespace cinn::backends diff --git a/paddle/cinn/backends/llvm/codegen_x86_test.cc b/paddle/cinn/backends/llvm/codegen_x86_test.cc new file mode 100644 index 0000000000000..95ded4776ce56 --- /dev/null +++ b/paddle/cinn/backends/llvm/codegen_x86_test.cc @@ -0,0 +1,73 @@ +// Copyright (c) 2021 CINN Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
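+
+// A worked example of the static task partition that codegen_x86.cc applies
+// when lowering a parallel `For` (illustrative numbers only, not taken from
+// the tests below): with extent = 10 and num_task = 4,
+//   step = (extent + num_task - 1) / num_task = 3
+// and task i covers [min(i * step, extent), min((i + 1) * step, extent)),
+// i.e. [0, 3), [3, 6), [6, 9), [9, 10) -- the last task absorbs the
+// remainder and no task runs past the extent.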
+ +#include "cinn/backends/llvm/codegen_x86.h" + +#include + +#include "cinn/backends/llvm/simple_jit.h" +#include "cinn/cinn.h" +#include "cinn/common/test_helper.h" +#include "cinn/runtime/cinn_runtime.h" + +namespace cinn { +namespace backends { + +TEST(Vectorize, basic) { + Expr M(1024); + Placeholder A("A", {M}); + Placeholder B("B", {M}); + + auto C = Compute( + {M}, [&](Expr i) { return A(i) + B(i); }, "C"); + auto stages = CreateStages({C}); + + stages[C]->Vectorize(0, 8); + + auto fn = Lower("fn", stages, {A, B, C}); + + LOG(INFO) << "fn: " << fn; + + Module::Builder builder("module", common::DefaultHostTarget()); + builder.AddFunction(fn); + + auto module = builder.Build(); + + LOG(INFO) << "\n" << module->functions[0]; + + auto jit = SimpleJIT::Create(); + jit->Link(builder.Build()); + + auto fn_ = jit->Lookup("fn"); + + auto* fn_ptr = reinterpret_cast(fn_); + + auto* A_buf = common::BufferBuilder(Float(32), {1024}).set_random().set_align(64).Build(); + auto* B_buf = common::BufferBuilder(Float(32), {1024}).set_random().set_align(64).Build(); + auto* C_buf = common::BufferBuilder(Float(32), {1024}).set_zero().set_align(64).Build(); + + auto args = common::ArgsBuilder().Add(A_buf).Add(B_buf).Add(C_buf).Build(); + + fn_ptr(reinterpret_cast(args.data()), args.size()); + + auto* A_data = reinterpret_cast(A_buf->memory); + auto* B_data = reinterpret_cast(B_buf->memory); + auto* C_data = reinterpret_cast(C_buf->memory); + for (int i = 0; i < C_buf->num_elements(); i++) { + ASSERT_NEAR(A_data[i] + B_data[i], C_data[i], 1e-5); + } +} + +} // namespace backends +} // namespace cinn diff --git a/paddle/cinn/backends/llvm/execution_engine.cc b/paddle/cinn/backends/llvm/execution_engine.cc new file mode 100644 index 0000000000000..175e58dbdd59b --- /dev/null +++ b/paddle/cinn/backends/llvm/execution_engine.cc @@ -0,0 +1,250 @@ +// Copyright (c) 2021 CINN Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "cinn/backends/llvm/execution_engine.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include // NOLINT +#include +#include + +#include "cinn/backends/codegen_cuda_host.h" +#include "cinn/backends/llvm/cinn_runtime_llvm_ir.h" +#include "cinn/backends/llvm/codegen_llvm.h" +#include "cinn/backends/llvm/codegen_x86.h" +#include "cinn/backends/llvm/llvm_optimizer.h" +#include "cinn/backends/llvm/llvm_util.h" +#include "cinn/backends/llvm/runtime_symbol_registry.h" +#include "cinn/ir/ir_printer.h" +#include "cinn/runtime/intrinsic.h" +#include "cinn/utils/profiler.h" + +namespace cinn::backends { +namespace { +void InitializeLLVMPasses() { + llvm::InitializeNativeTarget(); + llvm::InitializeNativeTargetAsmPrinter(); + + auto ®istry = *llvm::PassRegistry::getPassRegistry(); + llvm::initializeCore(registry); + llvm::initializeTransformUtils(registry); + llvm::initializeScalarOpts(registry); + llvm::initializeIPO(registry); + llvm::initializeInstCombine(registry); + llvm::initializeAggressiveInstCombine(registry); + llvm::initializeAnalysis(registry); + llvm::initializeVectorization(registry); + llvm::initializeSROALegacyPassPass(registry); + + // llvm::initializeCodeGen(registry); + // llvm::initializeTarget(registry); + // llvm::initializeCodeGenPreparePass(registry); +} +} // namespace +void NaiveObjectCache::notifyObjectCompiled(const llvm::Module *m, llvm::MemoryBufferRef obj_buffer) { + cached_objects_[m->getModuleIdentifier()] = + llvm::MemoryBuffer::getMemBufferCopy(obj_buffer.getBuffer(), obj_buffer.getBufferIdentifier()); +} + +std::unique_ptr NaiveObjectCache::getObject(const llvm::Module *m) { + auto it = cached_objects_.find(m->getModuleIdentifier()); + if (it == cached_objects_.end()) { + VLOG(1) << "No object for " << m->getModuleIdentifier() << " in cache. 
Compiling."; + return nullptr; + } + + VLOG(3) << "Object for " << m->getModuleIdentifier() << " loaded from cache."; + return llvm::MemoryBuffer::getMemBuffer(it->second->getMemBufferRef()); +} + +/*static*/ std::unique_ptr ExecutionEngine::Create(const ExecutionOptions &config) { + return Create(config, {}); +} + +/*static*/ std::unique_ptr ExecutionEngine::Create(const ExecutionOptions &config, + RuntimeSymbols &&module_symbols) { + VLOG(1) << "===================== Create CINN ExecutionEngine begin ===================="; + VLOG(1) << "initialize llvm config"; + VLOG(1) << "llvm version: " << LLVM_VERSION_STRING; + VLOG(1) << "llvm default target triple: " << LLVM_DEFAULT_TARGET_TRIPLE; + + static std::once_flag flag; + std::call_once(flag, InitializeLLVMPasses); + + auto engine = std::make_unique(/*enable_object_cache=*/true, std::move(module_symbols)); + + auto compile_layer_creator = [&engine](llvm::orc::JITTargetMachineBuilder jtmb) + -> llvm::Expected> { + auto machine = llvm::cantFail(jtmb.createTargetMachine()); + VLOG(1) << "create llvm compile layer"; + VLOG(1) << "Target Name: " << machine->getTarget().getName(); + VLOG(1) << "Target CPU: " << machine->getTargetCPU().str() << std::endl; + return std::make_unique(std::move(machine), engine->cache_.get()); + }; + + auto object_layer_creator = [&](llvm::orc::ExecutionSession &session, const llvm::Triple &triple) { + auto object_layer = std::make_unique( + session, []() { return std::make_unique(); }); + llvm::orc::JITDylib *main_jd = session.getJITDylibByName("
"); + if (!main_jd) { + main_jd = &llvm::cantFail(session.createJITDylib("
")); + } + return object_layer; + }; + + VLOG(2) << "create jit execution engine"; + engine->jit_ = llvm::cantFail(llvm::orc::LLJITBuilder() + .setCompileFunctionCreator(compile_layer_creator) + .setObjectLinkingLayerCreator(object_layer_creator) + .create()); + engine->jit_->getMainJITDylib().addGenerator(llvm::cantFail( + llvm::orc::DynamicLibrarySearchGenerator::GetForCurrentProcess(engine->jit_->getDataLayout().getGlobalPrefix()))); + + VLOG(2) << "register runtime call symbols"; + + engine->RegisterRuntimeSymbols(); + + VLOG(2) << "===================== Create CINN ExecutionEngine end ===================="; + return engine; +} + +template +void ExecutionEngine::Link(const ir::Module &module) { + utils::RecordEvent("ExecutionEngine Link", utils::EventType::kOrdinary); + llvm::SMDiagnostic error; + auto ctx = std::make_unique(); + auto m = llvm::parseAssemblyString(AsStringRef(backends::kRuntimeLlvmIr), error, *ctx); + auto b = std::make_unique>(*ctx); + auto ir_emitter = std::make_unique(m.get(), b.get()); + VLOG(3) << "ir_emitter->Compile(module) Begin"; + ir_emitter->Compile(module); + VLOG(3) << "ir_emitter->Compile(module) Succeed!"; + CHECK(!llvm::verifyModule(*m, &llvm::errs())) << "Invalid module found"; + + auto machine = + std::move(llvm::cantFail(llvm::cantFail(llvm::orc::JITTargetMachineBuilder::detectHost()).createTargetMachine())); + LLVMModuleOptimizer optimize(machine.get(), 3, {}, true); + optimize(m.get()); + CHECK(!llvm::verifyModule(*m, &llvm::errs())) << "Invalid optimized module detected"; + for (auto &f : *m) { + VLOG(5) << "function: " << DumpToString(f); + } + + llvm::raw_svector_ostream rawstream(buffer_); + llvm::legacy::PassManager pass_manager; + machine->addPassesToEmitFile(pass_manager, rawstream, nullptr, llvm::CGFT_ObjectFile); + pass_manager.run(*m); + + CHECK(AddModule(std::move(m), std::move(ctx))); + + if (VLOG_IS_ON(5)) { + VLOG(5) << "======= dump jit execution session ======"; + std::string buffer; + llvm::raw_string_ostream os(buffer); + decltype(auto) es = jit_->getExecutionSession(); + es.dump(os); + os.flush(); + VLOG(5) << buffer; + } +} + +bool ExecutionEngine::AddModule(std::unique_ptr module, std::unique_ptr context) { + utils::RecordEvent("ExecutionEngine AddModule", utils::EventType::kOrdinary); + module->setDataLayout(jit_->getDataLayout()); + if (VLOG_IS_ON(5)) { + VLOG(5) << "======= dump jit lib =========="; + std::string buffer; + llvm::raw_string_ostream os(buffer); + module->print(os, {}); + // main_jd_->dump(os); + os.flush(); + VLOG(5) << buffer; + } + llvm::orc::ThreadSafeContext tsc(std::move(context)); + llvm::orc::ThreadSafeModule tsm(std::move(module), std::move(tsc)); + llvm::cantFail(jit_->addIRModule(std::move(tsm))); + return true; +} + +void ExecutionEngine::ExportObject(const std::string &path) { + FILE *of = fopen(path.c_str(), "w"); + fwrite(buffer_.data(), 1, buffer_.size(), of); + fclose(of); +} + +void *ExecutionEngine::Lookup(absl::string_view name) { + utils::RecordEvent("ExecutionEngine Lookup", utils::EventType::kOrdinary); + std::lock_guard lock(mu_); + if (auto symbol = jit_->lookup(AsStringRef(name))) { + return reinterpret_cast(symbol->getAddress()); + } + + LOG(ERROR) << "Unknown symbol name[" << name << "]"; + return nullptr; +} + +void ExecutionEngine::RegisterRuntimeSymbols() { + utils::RecordEvent("ExecutionEngine RegisterRuntimeSymbols", utils::EventType::kOrdinary); + const auto ®istry = GlobalSymbolRegistry::Global(); + auto *session = &jit_->getExecutionSession(); + for (const auto &sym : 
registry.All()) { + llvm::cantFail(jit_->define(llvm::orc::absoluteSymbols( + {{session->intern(sym.first), {llvm::pointerToJITTargetAddress(sym.second), llvm::JITSymbolFlags::None}}}))); + } + for (const auto &sym : module_symbols_.All()) { + llvm::cantFail(jit_->define(llvm::orc::absoluteSymbols( + {{session->intern(sym.first), {llvm::pointerToJITTargetAddress(sym.second), llvm::JITSymbolFlags::None}}}))); + } +} + +template void ExecutionEngine::Link(const ir::Module &module); +template void ExecutionEngine::Link(const ir::Module &module); +template void ExecutionEngine::Link(const ir::Module &module); + +} // namespace cinn::backends diff --git a/paddle/cinn/backends/llvm/execution_engine.h b/paddle/cinn/backends/llvm/execution_engine.h new file mode 100644 index 0000000000000..15a7e8793a139 --- /dev/null +++ b/paddle/cinn/backends/llvm/execution_engine.h @@ -0,0 +1,104 @@ +// Copyright (c) 2021 CINN Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include // NOLINT +#include +#include +#include + +#include "cinn/backends/llvm/codegen_x86.h" +#include "cinn/backends/llvm/llvm_util.h" +#include "cinn/backends/llvm/runtime_symbol_registry.h" +#include "cinn/ir/module.h" + +namespace cinn::backends { + +class NaiveObjectCache : public llvm::ObjectCache { + public: + void notifyObjectCompiled(const llvm::Module *, llvm::MemoryBufferRef) override; + std::unique_ptr getObject(const llvm::Module *) override; + + private: + llvm::StringMap> cached_objects_; +}; + +struct ExecutionOptions { + int opt_level{3}; + bool enable_debug_info{false}; + // TODO(fc500110) + // int num_compile_threads{1}; + // bool enable_fast_math; +}; + +class ExecutionEngine { + public: + static std::unique_ptr Create(const ExecutionOptions &config); + + static std::unique_ptr Create(const ExecutionOptions &config, RuntimeSymbols &&module_symbols); + + void *Lookup(absl::string_view name); + + template + void Link(const ir::Module &module); + + void ExportObject(const std::string &path); + + bool AddModule(std::unique_ptr module, std::unique_ptr context); + + protected: + explicit ExecutionEngine(bool enable_object_cache, RuntimeSymbols &&module_symbols) + : cache_(std::make_unique()), module_symbols_(std::move(module_symbols)) {} + + void RegisterRuntimeSymbols(); + + bool SetupTargetTriple(llvm::Module *module); + + // This may not be a compatible implementation. 
+ friend std::unique_ptr std::make_unique(bool &&, cinn::backends::RuntimeSymbols &&); + + private: + mutable std::mutex mu_; + llvm::SmallString<0> buffer_; + std::unique_ptr jit_; + std::unique_ptr cache_; + RuntimeSymbols module_symbols_; +}; + +} // namespace cinn::backends diff --git a/paddle/cinn/backends/llvm/execution_engine_test.cc b/paddle/cinn/backends/llvm/execution_engine_test.cc new file mode 100644 index 0000000000000..5818f33a645a8 --- /dev/null +++ b/paddle/cinn/backends/llvm/execution_engine_test.cc @@ -0,0 +1,329 @@ +// Copyright (c) 2021 CINN Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "cinn/backends/llvm/execution_engine.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "cinn/backends/llvm/cinn_runtime_llvm_ir.h" +#include "cinn/backends/llvm/codegen_llvm.h" +#include "cinn/backends/llvm/runtime_symbol_registry.h" +#include "cinn/cinn.h" +#include "cinn/ir/ir.h" +#include "cinn/ir/ir_printer.h" +#include "cinn/ir/module.h" +#include "cinn/lang/compute.h" +#include "cinn/lang/lower.h" +#include "cinn/lang/placeholder.h" +#include "cinn/optim/optimize.h" +#include "cinn/runtime/cpu/host_intrinsics.h" +#include "cinn/runtime/cpu/use_extern_funcs.h" + +namespace cinn { +namespace backends { + +namespace { +bool RegisterKnownSymbols() { + decltype(auto) registry = GlobalSymbolRegistry::Global(); + + registry.RegisterFn("sinf", reinterpret_cast(&sinf)); + registry.RegisterFn("sin", reinterpret_cast(static_cast(&sin))); + + registry.RegisterFn("cosf", reinterpret_cast(&cosf)); + registry.RegisterFn("cos", reinterpret_cast(static_cast(&cos))); + return true; +} + +[[maybe_unused]] bool unused = RegisterKnownSymbols(); + +constexpr int kM = 100; +constexpr int kN = 32; + +auto CreateTestBuffer() { + auto *A = cinn_buffer_t::new_(cinn_device_kind_t::cinn_x86_device, cinn_float32_t(), {kM, kN}, 32); + auto *B = cinn_buffer_t::new_(cinn_device_kind_t::cinn_x86_device, cinn_float32_t(), {kM, kN}, 32); + auto *C = cinn_buffer_t::new_(cinn_device_kind_t::cinn_x86_device, cinn_float32_t(), {kM, kN}, 32); + cinn_buffer_malloc(nullptr, A); + cinn_buffer_malloc(nullptr, B); + cinn_buffer_malloc(nullptr, C); + float *Ad = reinterpret_cast(A->memory); + float *Bd = reinterpret_cast(B->memory); + + for (int i = 0; i < A->num_elements(); i++) { + Ad[i] = static_cast(rand()) / RAND_MAX; // NOLINT + Bd[i] = static_cast(rand()) / RAND_MAX; // NOLINT + } + + float *Cd = reinterpret_cast(C->memory); + CHECK_EQ(C->num_elements(), A->num_elements()); + + return std::make_tuple(A, B, C); +} + +auto CreateTestCinnModule() { + ir::Expr M(kM); + ir::Expr N(kN); + lang::Placeholder A("A", {M, N}); + lang::Placeholder B("B", {M, N}); + + lang::Buffer C_buf(Float(32)); + auto C = lang::Compute( + {M, N}, [&](Var i, Var j) { return A(i, j) + B(i, j); }, "C"); + C->Bind(C_buf); + + 
common::Target target; + target.arch = common::Target::Arch::X86; + target.bits = common::Target::Bit::k32; + target.os = common::Target::OS::Linux; + ir::Module::Builder builder("module1", target); + + auto stages = CreateStages({C}); + auto funcs = lang::Lower("elementwise_add", stages, {A, B, C}); + + // auto func = optim::Optimize(funcs); + + builder.AddFunction(ir::LoweredFunc(funcs.As())); + return builder.Build(); +} +} // namespace + +TEST(llvm_test01, elementwise_add) { + return; + auto engine = backends::ExecutionEngine::Create({1}); + + auto _a_b_c_ = CreateTestBuffer(); // NOLINT + auto &a = std::get<0>(_a_b_c_); + auto &b = std::get<1>(_a_b_c_); + auto &c = std::get<2>(_a_b_c_); + + auto module = CreateTestCinnModule(); + + engine->Link(module); + + auto elementwise_add_addr = engine->Lookup("elementwise_add"); + return; + auto elementwise_add = reinterpret_cast(elementwise_add_addr); + cinn_pod_value_t a_arg(a), b_arg(b), c_arg(c); + cinn_pod_value_t args[3] = {a_arg, b_arg, c_arg}; + elementwise_add(args, 3); + + float *ad = reinterpret_cast(a->memory); + float *bd = reinterpret_cast(b->memory); + float *cd = reinterpret_cast(c->memory); + + for (int i = 0; i < c->num_elements(); i++) { + EXPECT_EQ(ad[i] + bd[i], cd[i]); + } +} + +TEST(llvm, module_call_lowered_func) { + ir::Module::Builder builder("some_module", common::DefaultHostTarget()); + ir::Expr M(kM); + ir::Expr N(kN); + { // define fn + lang::Placeholder a("A", {M, N}); + lang::Placeholder b("B", {M, N}); + auto c = lang::Compute( + {M, N}, [&](auto i, auto j) { return a(i, j) + b(i, j); }, "C"); + + auto stages = CreateStages({c}); + auto fn = lang::Lower("elementwise_add", stages, {a, b, c}, {}); + builder.AddFunction(fn); + } + + { // call fn + lang::Placeholder a("A", {M, N}); + lang::Placeholder b("B", {M, N}); + + std::vector ret_types({lang::ReturnType{Float(32), {M, N}, "c_out"}}); + + auto call_outs = lang::CallLowered("elementwise_add", {a, b}, ret_types); + auto c = call_outs[0]; + + // here we must call the output, so that it cal output something. 
+ + auto stages = CreateStages({c}); + auto main_fn = lang::Lower("main", stages, {a, b, c}, {}); + builder.AddFunction(main_fn); + + CodeGenC codegen(common::DefaultHostTarget()); + codegen.SetInlineBuiltinCodes(false); + LOG(INFO) << "module:\n" << codegen.Compile(builder.Build(), CodeGenC::OutputKind::CImpl); + } + + auto _ab_bb_cb_ = CreateTestBuffer(); // NOLINT + auto &ab = std::get<0>(_ab_bb_cb_); + auto &bb = std::get<1>(_ab_bb_cb_); + auto &cb = std::get<2>(_ab_bb_cb_); + do { // call the function + auto engine = backends::ExecutionEngine::Create({1}); + + LOG(INFO) << "JIT Link the module"; + engine->Link(builder.Build()); + auto cos_fn = (double (*)(double))engine->Lookup("cos"); + LOG(INFO) << "=> LLVM JIT cos(0) = " << cos_fn(0); + auto elementwise_add_addr = engine->Lookup("elementwise_add"); + auto elementwise_add = reinterpret_cast(elementwise_add_addr); + LOG(INFO) << "JIT get elementwise_add_addr"; + break; + + cinn_pod_value_t a_arg(ab), b_arg(bb), c_arg(cb); + cinn_pod_value_t args[3] = {a_arg, b_arg, c_arg}; + + elementwise_add(args, 3); + + auto *ad = reinterpret_cast(ab->memory); + auto *bd = reinterpret_cast(bb->memory); + for (int i = 0; i < kM; i++) { + for (int j = 0; j < kN; j++) { + auto *data = reinterpret_cast(cb->memory); + ASSERT_NEAR(data[i * kN + j], ad[i * kN + j] + bd[i * kN + j], 1e-5); + } + } + } while (false); +} + +TEST(ExecutionEngine, custom_runtime_symbols) { + auto context = std::make_unique(); + auto module = std::make_unique("test_llvm_cpu_runtime", *context); + auto builder = std::make_unique>(*context); + + auto call_custom_target = [&](std::string name, llvm::Type *ty) { + llvm::FunctionType *fn_type = llvm::FunctionType::get(ty, {ty}, false); + llvm::Function *function = + llvm::Function::Create(fn_type, llvm::Function::ExternalLinkage, "_call_custom_" + name, module.get()); + function->setCallingConv(llvm::CallingConv::C); + llvm::BasicBlock *entry = llvm::BasicBlock::Create(module->getContext(), "entry", function); + builder->SetInsertPoint(entry); + llvm::Argument *arg = &*function->args().begin(); + llvm::Function *custom_function = + llvm::dyn_cast(module->getOrInsertFunction(name, fn_type).getCallee()); + custom_function->setCallingConv(llvm::CallingConv::C); + llvm::Value *ret = builder->CreateCall(custom_function, {arg}); + builder->CreateRet(ret); + }; + + llvm::Type *f32 = builder->getFloatTy(); + llvm::Type *f64 = builder->getDoubleTy(); + call_custom_target("cosf", f32); + call_custom_target("cos", f64); + call_custom_target("sinf", f32); + call_custom_target("sin", f64); + + double pi = std::acos(-1); + + std::vector angle = {0., pi / 6., pi / 4., pi / 3., pi / 2., pi}; + + std::random_device rd; + std::mt19937 mt(rd()); + std::uniform_int_distribution dis(-100, 100); + int random_x = dis(mt); + int random_y = dis(mt); + + decltype(auto) registry = GlobalSymbolRegistry::Global(); + // registry.Register("dereference_f64_ptr", (void *)+[](double *x) { return *x; }); + + for (size_t i = 0; i < angle.size(); i++) { + registry.RegisterVar("theta_" + std::to_string(i), angle[i]); + } + + auto engine = cinn::backends::ExecutionEngine::Create({1}); + engine->AddModule(std::move(module), std::move(context)); + + auto *call_cosf = reinterpret_cast(engine->Lookup("_call_custom_cosf")); + auto *call_cos = reinterpret_cast(engine->Lookup("_call_custom_cos")); + auto *call_sinf = reinterpret_cast(engine->Lookup("_call_custom_sinf")); + auto *call_sin = reinterpret_cast(engine->Lookup("_call_custom_sin")); + + ASSERT_TRUE(call_cosf && 
call_cos && call_sinf && call_sin); + + for (auto theta : angle) { + float theta_f = static_cast(theta); + ASSERT_NEAR(call_cosf(theta_f), cosf(theta_f), 1e-6); + ASSERT_NEAR(call_cos(theta), cos(theta), 1e-6); + ASSERT_NEAR(call_sinf(theta_f), sinf(theta_f), 1e-6); + ASSERT_NEAR(call_sin(theta), sin(theta), 1e-6); + } +} + +TEST(ExecutionEngine, call_extern) { + ir::Expr M(kM); + ir::Expr N(kN); + + Placeholder x("x", {M, N}); + Placeholder y("y", {M, N}); + + auto add_out = Compute( + {M, N}, [=](Var i, Var j) { return x(i, j) + y(i, j); }, "add_out"); + + ir::Tensor res = Compute( + {M, N}, [&](Var i, Var j) -> Expr { return lang::CallExtern("tanh", {add_out(i, j)}); }, "res"); + + auto stages = CreateStages({add_out, res}); + + stages[add_out]->ComputeInline(); + auto func = Lower("comp", stages, {x, y, res}); + + Module::Builder builder("module0", common::DefaultHostTarget()); + builder.AddFunction(func); + + auto engine = backends::ExecutionEngine::Create({1}); + + engine->Link(builder.Build()); + + auto _ab_bb_cb_ = CreateTestBuffer(); // NOLINT + auto &ab = std::get<0>(_ab_bb_cb_); + auto &bb = std::get<1>(_ab_bb_cb_); + auto &cb = std::get<2>(_ab_bb_cb_); + + auto comp_addr = engine->Lookup("comp"); + auto comp = reinterpret_cast(comp_addr); + + cinn_pod_value_t a_arg(ab), b_arg(bb), c_arg(cb); + cinn_pod_value_t args[3] = {a_arg, b_arg, c_arg}; + + comp(args, 3); + + auto *ad = reinterpret_cast(ab->memory); + auto *bd = reinterpret_cast(bb->memory); + auto *cd = reinterpret_cast(cb->memory); + for (int m = 0; m < kM; m++) { + for (int n = 0; n < kN; n++) { + ASSERT_NEAR(cd[m * kN + n], tanh(ad[m * kN + n] + bd[m * kN + n]), 1e-5); + } + } +} + +} // namespace backends +} // namespace cinn diff --git a/paddle/cinn/backends/llvm/generate_runtime_llvm_ir.py b/paddle/cinn/backends/llvm/generate_runtime_llvm_ir.py new file mode 100644 index 0000000000000..2d8d93aa5d334 --- /dev/null +++ b/paddle/cinn/backends/llvm/generate_runtime_llvm_ir.py @@ -0,0 +1,57 @@ +#!/usr/bin/env python3 + +# Copyright (c) 2021 CINN Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
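+
+# A sketch of the header this script generates (illustrative; the string
+# literal holds the verbatim contents of the input .ll file, and llvm_version
+# holds the digits reported by `llvm-config --version`, so the values below
+# are examples only):
+#
+#   namespace cinn::backends {
+#   static const absl::string_view kRuntimeLlvmIr(
+#   R"ROC(
+#   ; ...contents of the input .ll file...
+#   )ROC"
+#   );
+#   struct llvm_version {
+#     static constexpr int kMajor = 11;
+#     static constexpr int kMinor = 0;
+#     static constexpr int kMicro = 0;
+#   };
+#   }  // namespace cinn::backends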
+ +import sys +import subprocess + + +def main(): + path = sys.argv[1] + out_path = sys.argv[2] + llvm_config = sys.argv[3] + + srcs = [] + srcs.append('#include ') + #srcs.append('#include "cinn/backends/llvm/cinn_runtime_llvm_ir.h"\n') + srcs.append('namespace cinn::backends {') + srcs.append("static const absl::string_view kRuntimeLlvmIr(") + srcs.append('R"ROC(') + with open(path, 'r') as fr: + srcs.append(fr.read()) + + srcs.append(')ROC"') + srcs.append(');\n') + + cmd = "{} --version".format(llvm_config) + version = subprocess.check_output( + cmd, shell=True).decode('utf-8').strip().split('.') + srcs.append("struct llvm_version {") + for v, n in zip(["major", "minor", "micro"], version): + srcs.append(" static constexpr int k{} = {};".format( + v.title(), ''.join(filter(str.isdigit, n)))) + srcs.append("};") + + srcs.append('} // namespace cinn::backends') + with open(out_path, 'w') as fw: + fw.write("\n".join(srcs)) + + +def get_clang_version(): + pass + + +if __name__ == "__main__": + main() diff --git a/paddle/cinn/backends/llvm/ir_builder_mixin.h b/paddle/cinn/backends/llvm/ir_builder_mixin.h new file mode 100644 index 0000000000000..42b1e9663afbb --- /dev/null +++ b/paddle/cinn/backends/llvm/ir_builder_mixin.h @@ -0,0 +1,306 @@ +// Copyright (c) 2021 CINN Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
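+
+// IrBuilderMixin is a CRTP mixin: the deriving class passes itself as the
+// template argument and must expose an `llvm::IRBuilder<> *b()` accessor,
+// which every forwarding helper here reaches via mixin_builder(). A minimal
+// sketch of a client (hypothetical class, for illustration only):
+//
+//   class MyCodeGen : public IrBuilderMixin<MyCodeGen> {
+//    public:
+//     llvm::IRBuilder<> *b() { return builder_; }  // hook the mixin requires
+//     llvm::Value *AddInts(llvm::Value *x, llvm::Value *y) {
+//       return Add(x, y);  // forwards to builder_->CreateAdd(x, y)
+//     }
+//
+//    private:
+//     llvm::IRBuilder<> *builder_{nullptr};
+//   };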

#pragma once

#include <llvm/IR/IRBuilder.h>
#include <llvm/IR/Module.h>

#include <utility>

namespace cinn {
namespace backends {
template <typename T>
class IrBuilderMixin {
 protected:
  template <typename... Args>
  decltype(auto) BinOp(Args &&...args) {
    return mixin_builder()->CreateBinOp(std::forward<Args>(args)...);
  }

  /// \brief +
  template <typename... Args>
  decltype(auto) Add(Args &&...args) {
    return mixin_builder()->CreateAdd(std::forward<Args>(args)...);
  }
  template <typename... Args>
  decltype(auto) FAdd(Args &&...args) {
    return mixin_builder()->CreateFAdd(std::forward<Args>(args)...);
  }
  template <typename... Args>
  decltype(auto) NSWAdd(Args &&...args) {
    return mixin_builder()->CreateNSWAdd(std::forward<Args>(args)...);
  }

  /// \brief -
  template <typename... Args>
  decltype(auto) Sub(Args &&...args) {
    return mixin_builder()->CreateSub(std::forward<Args>(args)...);
  }
  template <typename... Args>
  decltype(auto) FSub(Args &&...args) {
    return mixin_builder()->CreateFSub(std::forward<Args>(args)...);
  }
  template <typename... Args>
  decltype(auto) NSWSub(Args &&...args) {
    return mixin_builder()->CreateNSWSub(std::forward<Args>(args)...);
  }

  /// \brief *
  template <typename... Args>
  decltype(auto) Mul(Args &&...args) {
    return mixin_builder()->CreateMul(std::forward<Args>(args)...);
  }
  template <typename... Args>
  decltype(auto) FMul(Args &&...args) {
    return mixin_builder()->CreateFMul(std::forward<Args>(args)...);
  }
  template <typename... Args>
  decltype(auto) NSWMul(Args &&...args) {
    return mixin_builder()->CreateNSWMul(std::forward<Args>(args)...);
  }

  /// \brief /
  template <typename... Args>
  decltype(auto) SDiv(Args &&...args) {
    return mixin_builder()->CreateSDiv(std::forward<Args>(args)...);
  }
  template <typename... Args>
  decltype(auto) UDiv(Args &&...args) {
    return mixin_builder()->CreateUDiv(std::forward<Args>(args)...);
  }
  template <typename... Args>
  decltype(auto) FDiv(Args &&...args) {
    return mixin_builder()->CreateFDiv(std::forward<Args>(args)...);
  }

  /// \brief %
  template <typename... Args>
  decltype(auto) SRem(Args &&...args) {
    return mixin_builder()->CreateSRem(std::forward<Args>(args)...);
  }
  template <typename... Args>
  decltype(auto) URem(Args &&...args) {
    return mixin_builder()->CreateURem(std::forward<Args>(args)...);
  }
  template <typename... Args>
  decltype(auto) FRem(Args &&...args) {
    return mixin_builder()->CreateFRem(std::forward<Args>(args)...);
  }

  template <typename... Args>
  decltype(auto) And(Args &&...args) {
    return mixin_builder()->CreateAnd(std::forward<Args>(args)...);
  }
  template <typename... Args>
  decltype(auto) Or(Args &&...args) {
    return mixin_builder()->CreateOr(std::forward<Args>(args)...);
  }
  template <typename... Args>
  decltype(auto) Not(Args &&...args) {
    return mixin_builder()->CreateNot(std::forward<Args>(args)...);
  }

  template <typename... Args>
  decltype(auto) Neg(Args &&...args) {
    return mixin_builder()->CreateNeg(std::forward<Args>(args)...);
  }
  template <typename... Args>
  decltype(auto) FNeg(Args &&...args) {
    return mixin_builder()->CreateFNeg(std::forward<Args>(args)...);
  }

  template <typename... Args>
  decltype(auto) ICmpEQ(Args &&...args) {
    return mixin_builder()->CreateICmpEQ(std::forward<Args>(args)...);
  }
  template <typename... Args>
  decltype(auto) FCmpOEQ(Args &&...args) {
    return mixin_builder()->CreateFCmpOEQ(std::forward<Args>(args)...);
  }
  template <typename... Args>
  decltype(auto) FCmpUEQ(Args &&...args) {
    return mixin_builder()->CreateFCmpUEQ(std::forward<Args>(args)...);
  }
  template <typename... Args>
  decltype(auto) ICmpNE(Args &&...args) {
    return mixin_builder()->CreateICmpNE(std::forward<Args>(args)...);
  }
  template <typename... Args>
  decltype(auto) FCmpONE(Args &&...args) {
    return mixin_builder()->CreateFCmpONE(std::forward<Args>(args)...);
  }
  template <typename... Args>
  decltype(auto) FCmpUNE(Args &&...args) {
    return mixin_builder()->CreateFCmpUNE(std::forward<Args>(args)...);
  }
  template <typename... Args>
  decltype(auto) ICmpULE(Args &&...args) {
    return mixin_builder()->CreateICmpULE(std::forward<Args>(args)...);
  }
  template <typename... Args>
  decltype(auto) FCmpOLE(Args &&...args) {
    return mixin_builder()->CreateFCmpOLE(std::forward<Args>(args)...);
  }
  template <typename... Args>
  decltype(auto) ICmpULT(Args &&...args) {
    return mixin_builder()->CreateICmpULT(std::forward<Args>(args)...);
  }
  template <typename... Args>
  decltype(auto) ICmpSLT(Args &&...args) {
    return mixin_builder()->CreateICmpSLT(std::forward<Args>(args)...);
  }
  template <typename... Args>
  decltype(auto) FCmpOLT(Args &&...args) {
    return mixin_builder()->CreateFCmpOLT(std::forward<Args>(args)...);
  }
  template <typename... Args>
  decltype(auto) ICmpUGE(Args &&...args) {
    return mixin_builder()->CreateICmpUGE(std::forward<Args>(args)...);
  }
  template <typename... Args>
  decltype(auto) ICmpSGE(Args &&...args) {
    return mixin_builder()->CreateICmpSGE(std::forward<Args>(args)...);
  }
  template <typename... Args>
  decltype(auto) FCmpOGE(Args &&...args) {
    return mixin_builder()->CreateFCmpOGE(std::forward<Args>(args)...);
  }
  template <typename... Args>
  decltype(auto) ICmpUGT(Args &&...args) {
    return mixin_builder()->CreateICmpUGT(std::forward<Args>(args)...);
  }
  template <typename... Args>
  decltype(auto) ICmpSGT(Args &&...args) {
    return mixin_builder()->CreateICmpSGT(std::forward<Args>(args)...);
  }
  template <typename... Args>
  decltype(auto) FCmpOGT(Args &&...args) {
    return mixin_builder()->CreateFCmpOGT(std::forward<Args>(args)...);
  }

  template <typename... Args>
  decltype(auto) BitCast(Args &&...args) {
    return mixin_builder()->CreateBitCast(std::forward<Args>(args)...);
  }
  template <typename... Args>
  decltype(auto) IntCast(Args &&...args) {
    return mixin_builder()->CreateIntCast(std::forward<Args>(args)...);
  }
  template <typename... Args>
  decltype(auto) FPCast(Args &&...args) {
    return mixin_builder()->CreateFPCast(std::forward<Args>(args)...);
  }
  template <typename... Args>
  decltype(auto) PointerCast(Args &&...args) {
    return mixin_builder()->CreatePointerCast(std::forward<Args>(args)...);
  }

  template <typename... Args>
  decltype(auto) FPToSI(Args &&...args) {
    return mixin_builder()->CreateFPToSI(std::forward<Args>(args)...);
  }
  template <typename... Args>
  decltype(auto) FPToUI(Args &&...args) {
    return mixin_builder()->CreateFPToUI(std::forward<Args>(args)...);
  }
  template <typename... Args>
  decltype(auto) SIToFP(Args &&...args) {
    return mixin_builder()->CreateSIToFP(std::forward<Args>(args)...);
  }
  template <typename... Args>
  decltype(auto) UIToFP(Args &&...args) {
    return mixin_builder()->CreateUIToFP(std::forward<Args>(args)...);
  }

  template <typename... Args>
  decltype(auto) Select(Args &&...args) {
    return mixin_builder()->CreateSelect(std::forward<Args>(args)...);
  }
  template <typename... Args>
  decltype(auto) Br(Args &&...args) {
    return mixin_builder()->CreateBr(std::forward<Args>(args)...);
  }
  template <typename... Args>
  decltype(auto) CondBr(Args &&...args) {
    return mixin_builder()->CreateCondBr(std::forward<Args>(args)...);
  }

  template <typename... Args>
  decltype(auto) Alloca(Args &&...args) {
    return mixin_builder()->CreateAlloca(std::forward<Args>(args)...);
  }
  template <typename... Args>
  decltype(auto) Load(Args &&...args) {
    return mixin_builder()->CreateLoad(std::forward<Args>(args)...);
  }
  template <typename... Args>
  decltype(auto) AlignedLoad(Args &&...args) {
    return mixin_builder()->CreateAlignedLoad(std::forward<Args>(args)...);
  }
  template <typename... Args>
  decltype(auto) Store(Args &&...args) {
    return mixin_builder()->CreateStore(std::forward<Args>(args)...);
  }
  template <typename... Args>
  decltype(auto) AlignedStore(Args &&...args) {
    return mixin_builder()->CreateAlignedStore(std::forward<Args>(args)...);
  }
  template <typename... Args>
  decltype(auto) Call(Args &&...args) {
    return mixin_builder()->CreateCall(std::forward<Args>(args)...);
  }

  template <typename... Args>
  decltype(auto) RetVoid(Args &&...args) {
    return mixin_builder()->CreateRetVoid(std::forward<Args>(args)...);
  }
  template <typename... Args>
  decltype(auto) GEP(Args &&...args) {
    return mixin_builder()->CreateGEP(std::forward<Args>(args)...);
  }
  template <typename... Args>
  decltype(auto) InBoundsGEP(Args &&...args) {
    return mixin_builder()->CreateInBoundsGEP(std::forward<Args>(args)...);
  }
  template <typename... Args>
  decltype(auto) PHI(Args &&...args) {
    return mixin_builder()->CreatePHI(std::forward<Args>(args)...);
  }

  template <typename... Args>
  decltype(auto) InsertValue(Args &&...args) {
    return mixin_builder()->CreateInsertValue(std::forward<Args>(args)...);
  }

  template <typename... Args>
  decltype(auto) ExtractValue(Args &&...args) {
    return mixin_builder()->CreateExtractValue(std::forward<Args>(args)...);
  }

  template <typename... Args>
  decltype(auto) InsertElement(Args &&...args) {
    return mixin_builder()->CreateInsertElement(std::forward<Args>(args)...);
  }

  template <typename... Args>
  decltype(auto) ShuffleVector(Args &&...args) {
    return mixin_builder()->CreateShuffleVector(std::forward<Args>(args)...);
  }

 private:
  llvm::IRBuilder<> *mixin_builder() { return static_cast<T *>(this)->b(); }
};
}  // namespace backends
}  // namespace cinn
diff --git a/paddle/cinn/backends/llvm/llvm_intrin_rule.h b/paddle/cinn/backends/llvm/llvm_intrin_rule.h
new file mode 100644
index 0000000000000..822349f8a8ae9
--- /dev/null
+++ b/paddle/cinn/backends/llvm/llvm_intrin_rule.h
@@ -0,0 +1,177 @@
// Copyright (c) 2021 CINN Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#pragma once

#include <glog/logging.h>
#include <llvm/IR/Intrinsics.h>

#include <string>

#include "cinn/cinn.h"
#include "cinn/ir/intrinsic_ops.h"
#include "cinn/ir/registry.h"
#include "cinn/lang/packed_func.h"

namespace cinn {
namespace codegen {

template <int id, int arg_nums, bool add_float_suffix = true>
inline void MakeFloatIntrinOp(lang::Args args, lang::RetValue *rv) {
  CHECK_GE(args.size(), 1U);
  Expr arg = args[0];
  ir::Call *node = arg->as<ir::Call>();
  CHECK(node);
  CHECK_GE(node->read_args.size(), arg_nums);
  if (add_float_suffix) {
    CHECK(node->type().is_float());
    *rv = ir::intrinsics::BuiltinIntrin::Make(node->name + "f", node->read_args, id, arg_nums, node->type());
  } else {
    *rv = ir::intrinsics::BuiltinIntrin::Make(node->name, node->read_args, id, arg_nums, node->type());
  }
}

void RegisterCpuIntrinRule() {
#define __(intrin_name__, id)                                           \
  ir::Registry::Register("lower_cpu_intrinsic_" #intrin_name__, true)   \
      .SetBody(MakeFloatIntrinOp<id, 1>);
  __(exp, ::llvm::Intrinsic::exp)
  __(exp2, ::llvm::Intrinsic::exp2)
  __(sqrt, ::llvm::Intrinsic::sqrt)
  __(log, ::llvm::Intrinsic::log)
  __(log2, ::llvm::Intrinsic::log2)
  __(log10, ::llvm::Intrinsic::log10)
  __(floor, ::llvm::Intrinsic::floor)
  __(ceil, ::llvm::Intrinsic::ceil)
  __(round, ::llvm::Intrinsic::round)
  __(trunc, ::llvm::Intrinsic::trunc)
  __(cos, ::llvm::Intrinsic::cos)
  __(sin, ::llvm::Intrinsic::sin)
  __(fabs, ::llvm::Intrinsic::fabs)
#undef __

// set id to -1 if the op is not an llvm intrinsic
#define RegisterBitwise(intrin_name__)                                  \
  ir::Registry::Register("lower_cpu_intrinsic_" #intrin_name__, true)   \
      .SetBody(MakeFloatIntrinOp<-1, 2, false>);
  RegisterBitwise(bitwise_or) RegisterBitwise(bitwise_xor) RegisterBitwise(bitwise_and)
      RegisterBitwise(left_shift) RegisterBitwise(right_shift)
#undef RegisterBitwise

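  // For orientation: each __(name, id) entry above expands roughly to
  //   ir::Registry::Register("lower_cpu_intrinsic_exp", true)
  //       .SetBody(MakeFloatIntrinOp<::llvm::Intrinsic::exp, 1>);
  // so a float call to the CINN intrinsic `exp` is lowered to llvm.exp, with
  // MakeFloatIntrinOp appending the "f" suffix for the float variant.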
  ir::Registry::Register("lower_cpu_intrinsic_fma", true)
      .SetBody(MakeFloatIntrinOp<::llvm::Intrinsic::fmuladd, 3, false>);

  ir::Registry::Register("lower_cpu_intrinsic_bitwise_not", true).SetBody(MakeFloatIntrinOp<-1, 1, false>);

  ir::Registry::Register("lower_cpu_intrinsic_isnan", true).SetBody(MakeFloatIntrinOp<-1, 1, false>);

  ir::Registry::Register("lower_cpu_intrinsic_isfinite", true).SetBody([](lang::Args args, lang::RetValue *rv) {
    CHECK_GE(args.size(), 1U);
    Expr arg0 = args[0];
    ir::Call *node = arg0->as<ir::Call>();
    CHECK(node);
    CHECK(!node->read_args.empty());
    Expr arg = node->read_args[0];
    *rv = !(lang::IsInf(arg)) && !(lang::IsNan(arg));
  });

  ir::Registry::Register("lower_cpu_intrinsic_isinf", true).SetBody([](lang::Args args, lang::RetValue *rv) {
    CHECK_GE(args.size(), 1U);
    Expr arg0 = args[0];
    ir::Call *node = arg0->as<ir::Call>();
    CHECK(node);
    CHECK(!node->read_args.empty());
    Expr arg = node->read_args[0];
    Type type = arg->type();
    if (type.is_int() || type.is_uint()) {
      *rv = common::make_bool(false, type.lanes());
    } else if (type.is_float()) {
      *rv = ir::EQ::Make(lang::Abs(arg), lang::Infinity(type)) && !(lang::IsNan(arg));
    }
  });

  ir::Registry::Register("lower_cpu_intrinsic_rsqrt", true).SetBody([](lang::Args args, lang::RetValue *rv) {
    CHECK_GE(args.size(), 1U);
    Expr arg0 = args[0];
    ir::Call *node = arg0->as<ir::Call>();
    CHECK(node);
    CHECK(!node->read_args.empty());
    Expr arg = node->read_args[0];
    *rv = make_const(arg->type(), 1) / lang::Sqrt(arg);
  });

  ir::Registry::Register("lower_cpu_intrinsic_exp10", true).SetBody([](lang::Args args, lang::RetValue *rv) {
    CHECK_GE(args.size(), 1U);
    Expr arg0 = args[0];
    ir::Call *node = arg0->as<ir::Call>();
    CHECK(node);
    CHECK(!node->read_args.empty());
    Expr arg = node->read_args[0];
    Expr ln10 = make_const(arg->type(), 2.302585093);
    *rv = lang::Exp(arg * ln10);
  });

  ir::Registry::Register("lower_cpu_intrinsic_tan", true).SetBody([](lang::Args args, lang::RetValue *rv) {
    CHECK_GE(args.size(), 1U);
    Expr arg0 = args[0];
    ir::Call *node = arg0->as<ir::Call>();
    CHECK(node);
    CHECK(!node->read_args.empty());
    Expr arg = node->read_args[0];
    *rv = lang::Sin(arg) / lang::Cos(arg);
  });

  ir::Registry::Register("lower_cpu_intrinsic_tanh", true).SetBody([](lang::Args args, lang::RetValue *rv) {
    CHECK_GE(args.size(), 1U);
    Expr arg0 = args[0];
    ir::Call *node = arg0->as<ir::Call>();
    CHECK(node);
    CHECK(!node->read_args.empty());
    Expr arg = node->read_args[0];
    Expr zero = make_const(arg->type(), 0);
    Expr one = make_const(arg->type(), 1);
    Expr two = make_const(arg->type(), 2);
    Expr neg_two = make_const(arg->type(), -2);

    Expr exp_neg2x = lang::Exp(neg_two * arg);
    Expr exp_pos2x = lang::Exp(two * arg);

    Expr tanh_pos = (one - exp_neg2x) / (one + exp_neg2x);
    Expr tanh_neg = (exp_pos2x - one) / (exp_pos2x + one);
    *rv = ir::Select::Make(arg >= zero, tanh_pos, tanh_neg);
  });

  ir::Registry::Register("lower_cpu_intrinsic_cosh", true).SetBody([](lang::Args args, lang::RetValue *rv) {
    CHECK_GE(args.size(), 1U);
    Expr arg0 = args[0];
    ir::Call *node = arg0->as<ir::Call>();
    CHECK(node);
    CHECK(!node->read_args.empty());
    Expr arg = node->read_args[0];
    *rv = (lang::Exp(arg) + lang::Exp(arg * make_const(arg->type(), -1))) / make_const(arg->type(), 2);
  });

  ir::Registry::Register("lower_cpu_intrinsic_sinh", true).SetBody([](lang::Args args, lang::RetValue *rv) {
    CHECK_GE(args.size(), 1U);
    Expr arg0 = args[0];
    ir::Call *node = arg0->as<ir::Call>();
    CHECK(node);
    CHECK(!node->read_args.empty());
    Expr arg = node->read_args[0];
    *rv = (lang::Exp(arg) - lang::Exp(arg * make_const(arg->type(), -1))) / make_const(arg->type(), 2);
  });
}
}  // namespace codegen
}  // namespace cinn
diff --git a/paddle/cinn/backends/llvm/llvm_optimizer.cc b/paddle/cinn/backends/llvm/llvm_optimizer.cc
new file mode 100644
index 0000000000000..ff5c60d74fd7a
--- /dev/null
+++ b/paddle/cinn/backends/llvm/llvm_optimizer.cc
@@ -0,0 +1,166 @@
// Copyright (c) 2021 CINN Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "cinn/backends/llvm/llvm_optimizer.h"

#include <glog/logging.h>
#include <llvm/Analysis/TargetTransformInfo.h>
#include <llvm/ExecutionEngine/Orc/JITTargetMachineBuilder.h>
#include <llvm/IR/LegacyPassManager.h>
#include <llvm/Pass.h>
#include <llvm/Support/Error.h>
#include <llvm/Target/TargetMachine.h>
#include <llvm/Transforms/IPO.h>
#include <llvm/Transforms/IPO/PassManagerBuilder.h>

#include <algorithm>
#include <memory>
#include <string>
#include <type_traits>
#include <utility>

#include "llvm/Support/CodeGen.h"

namespace cinn::backends {

namespace {
template <typename PassManagerT>
class CustomPassManager : public PassManagerT {
 public:
  template <typename... Ts>
  explicit CustomPassManager(bool print_passes, Ts &&...ts)
      : PassManagerT(std::forward<Ts>(ts)...), print_passes_(print_passes) {}

  void add(llvm::Pass *pass) override {
    if (print_passes_) {
      if (is_function_pass_manager_) {
        VLOG(1) << "llvm run function pass[" << std::string(pass->getPassName()) << "]";
      }

      if (is_module_pass_manager_) {
        VLOG(1) << "llvm run module pass[" << std::string(pass->getPassName()) << "]";
      }
    }
    // static bool add_pass = true;
    // if (add_pass) {
    //   PassManagerT::add(pass);
    // }

    // if (std::string(pass->getPassName()) == "Loop Vectorization") {
    //   return;
    // }
    PassManagerT::add(pass);
  }

  void run(llvm::Function &f) {  // NOLINT
    if (is_function_pass_manager_) {
      PassManagerT::run(f);
    }
  }

  void run(llvm::Module &m) {  // NOLINT
    if (is_module_pass_manager_) {
      PassManagerT::run(m);
    }
  }

 private:
  static constexpr bool is_function_pass_manager_ =
      std::is_same<PassManagerT, llvm::legacy::FunctionPassManager>::value;
  static constexpr bool is_module_pass_manager_ =
      std::is_same<PassManagerT, llvm::legacy::PassManager>::value;
  bool print_passes_;
};

using CustomFunctionPassManager = CustomPassManager<llvm::legacy::FunctionPassManager>;
using CustomModulePassManager = CustomPassManager<llvm::legacy::PassManager>;
}  // namespace

LLVMModuleOptimizer::LLVMModuleOptimizer(llvm::TargetMachine *machine,
                                         int opt_level,
                                         llvm::FastMathFlags fast_math_flags,
                                         bool print_passes)
    : opt_level_(opt_level), print_passes_(print_passes), machine_(machine) {}

void LLVMModuleOptimizer::operator()(llvm::Module *m) {
  auto machine = std::move(llvm::cantFail(
      llvm::cantFail(llvm::orc::JITTargetMachineBuilder::detectHost()).createTargetMachine()));
  auto fpm = std::make_unique<CustomFunctionPassManager>(print_passes_, m);
  // fpm->add(llvm::createTargetTransformInfoWrapperPass(llvm::TargetIRAnalysis()));
  // fpm->add(llvm::createInstructionCombiningPass());
  // fpm->add(llvm::createReassociatePass());
  // fpm->add(llvm::createGVNPass());
  // fpm->add(llvm::createCFGSimplificationPass());
  // fpm->add(llvm::createSROAPass());
  // fpm->add(llvm::createEarlyCSEPass());
  // fpm->add(llvm::createLowerExpectIntrinsicPass());
  // fpm->add(llvm::createCallSiteSplittingPass());
  // fpm->add(llvm::createLoopVectorizePass());
  // fpm->add(llvm::createSLPVectorizerPass());
  // fpm->add(llvm::createLoadStoreVectorizerPass());
  // fpm->add(llvm::createLoopUnrollPass());

  auto mpm = std::make_unique<CustomModulePassManager>(print_passes_);
  // mpm->add(llvm::createTargetTransformInfoWrapperPass(llvm::TargetIRAnalysis()));
  // LOG(INFO) << "llvm run pass: target machine: name[" << machine_->getTarget().getName() << "]";
  // LOG(INFO) << "llvm run pass: target machine: cpu[" << machine_->getTargetCPU().str() << "]";
  fpm->add(llvm::createTargetTransformInfoWrapperPass(machine->getTargetIRAnalysis()));
  mpm->add(llvm::createTargetTransformInfoWrapperPass(machine->getTargetIRAnalysis()));
  auto builder = std::make_unique<llvm::PassManagerBuilder>();
  builder->OptLevel = opt_level_;
  builder->Inliner = llvm::createFunctionInliningPass();
  builder->LoopVectorize = true;
  builder->SLPVectorize = true;
#if LLVM_VERSION_MAJOR >= 11
  machine->adjustPassManager(*builder);
#endif
  builder->populateFunctionPassManager(*fpm);
  builder->populateModulePassManager(*mpm);

  fpm->doInitialization();
  std::for_each(m->begin(), m->end(), [&fpm](auto &fn) { fpm->run(fn); });
  fpm->doFinalization();

  mpm->run(*m);
}

}  // namespace cinn::backends
diff --git a/paddle/cinn/backends/llvm/llvm_optimizer.h b/paddle/cinn/backends/llvm/llvm_optimizer.h
new file mode 100644
index 0000000000000..ea613c1da0b2b
--- /dev/null
+++ b/paddle/cinn/backends/llvm/llvm_optimizer.h
@@ -0,0 +1,43 @@
// Copyright (c) 2021 CINN Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#pragma once

#include <llvm/IR/Module.h>
#include <llvm/IR/Operator.h>
#include <llvm/Target/TargetMachine.h>

#include <memory>

namespace cinn::backends {

// TODO(fc500110): define class OptimizeOptions

// llvm module optimizer
class LLVMModuleOptimizer final {
 public:
  explicit LLVMModuleOptimizer(llvm::TargetMachine *machine,
                               int opt_level,
                               llvm::FastMathFlags fast_math_flags,
                               bool print_passes = false);
  void operator()(llvm::Module *m);

 private:
  llvm::TargetMachine *machine_;
  int opt_level_{};
  bool print_passes_{};
};
}  // namespace cinn::backends
diff --git a/paddle/cinn/backends/llvm/llvm_util.cc b/paddle/cinn/backends/llvm/llvm_util.cc
new file mode 100644
index 0000000000000..e03325faf4d21
--- /dev/null
+++ b/paddle/cinn/backends/llvm/llvm_util.cc
@@ -0,0 +1,146 @@
+// Copyright (c) 2021 CINN Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "cinn/backends/llvm/llvm_util.h" + +#include +#include + +#include +#include //NOLINT + +namespace cinn { +namespace backends { + +using cinn::common::bfloat16; +using cinn::common::float16; + +llvm::Type *CinnTypeToLLVMType(common::Type type, llvm::Module *m, bool is_vec) { + llvm::Type *ir_type = nullptr; + if (type.is_cpp_const()) { + // TODO(fc500110) support it latter. + } + + llvm::Type *v = llvm::Type::getVoidTy(m->getContext()); + + llvm::Type *i1 = llvm::Type::getInt1Ty(m->getContext()); + + llvm::Type *i8 = llvm::Type::getInt8Ty(m->getContext()); + llvm::Type *i16 = llvm::Type::getInt16Ty(m->getContext()); + llvm::Type *i32 = llvm::Type::getInt32Ty(m->getContext()); + llvm::Type *i64 = llvm::Type::getInt64Ty(m->getContext()); + + llvm::Type *u8 = llvm::Type::getInt8Ty(m->getContext()); + llvm::Type *u16 = llvm::Type::getInt16Ty(m->getContext()); + llvm::Type *u32 = llvm::Type::getInt32Ty(m->getContext()); + llvm::Type *u64 = llvm::Type::getInt64Ty(m->getContext()); + + llvm::Type *bf16 = llvm::Type::getBFloatTy(m->getContext()); + llvm::Type *f16 = llvm::Type::getHalfTy(m->getContext()); + llvm::Type *f32 = llvm::Type::getFloatTy(m->getContext()); + llvm::Type *f64 = llvm::Type::getDoubleTy(m->getContext()); + llvm::Type *arr = llvm::Type::getPrimitiveType(m->getContext(), llvm::Type::ArrayTyID); + if (type.is_void() && type.is_cpp_handle()) { + return llvm::PointerType::getUnqual(i8); + } + if (type.is_void() && type.is_cpp_handle2()) { + return llvm::PointerType::getUnqual(llvm::PointerType::getUnqual(i8)); + } + + if (type.is_bool()) { + ir_type = i1; + } else if (type.is_int(8)) { + ir_type = i8; + } else if (type.is_int(16)) { + ir_type = i16; + } else if (type.is_int(32)) { + ir_type = i32; + } else if (type.is_int(64)) { + ir_type = i64; + } else if (type.is_uint(8)) { + ir_type = u8; + } else if (type.is_uint(16)) { + ir_type = u16; + } else if (type.is_uint(32)) { + ir_type = u32; + } else if (type.is_uint(64)) { + ir_type = u64; + } else if (type.is_float(32)) { + ir_type = f32; + } else if (type.is_float(64)) { + ir_type = f64; + } else if (type.is_bfloat16()) { + ir_type = bf16; + } else if (type.is_float16()) { + ir_type = f16; + } else if (type.is_void()) { + ir_type = v; + } else if (type.is_string()) { + ir_type = arr; + } else if (type.is_customized_type()) { + CHECK(!type.customized_type().empty()); + ir_type = m->getTypeByName("struct." + type.customized_type()); + } + CHECK(ir_type) << "LLVM can't convert type: " << type; + + // C array / vector. 
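  // A type with lanes > 1 is widened below: to an LLVM SIMD vector type
  // ("<lanes x T>") when is_vec is set, otherwise to a plain array type
  // ("[lanes x T]").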
  if (type.lanes() > 1) {
    if (is_vec) {
      ir_type = llvm::FixedVectorType::get(ir_type, type.lanes());
    } else {
      ir_type = llvm::ArrayType::get(ir_type, type.lanes());
    }
  }

  if (type.is_cpp_handle()) {
    ir_type = llvm::PointerType::getUnqual(ir_type);
  }

  if (type.is_cpp_handle2()) {
    ir_type = llvm::PointerType::getUnqual(ir_type);
    ir_type = llvm::PointerType::getUnqual(ir_type);
  }

  return ir_type;
}

#define __(ty__)                                           \
  template <>                                              \
  llvm::Type *llvm_type_of<ty__>(llvm::Module * m) {       \
    return CinnTypeToLLVMType(common::type_of<ty__>(), m); \
  }

__(int8_t)
__(int16_t)
__(int32_t)
__(int64_t)
__(uint8_t)
__(uint16_t)
__(uint32_t)
__(uint64_t)
__(bfloat16)
__(float16)
__(float)
__(double)
__(cinn_buffer_t)
__(cinn_buffer_t *)
__(cinn_pod_value_t *)
__(cinn_pod_value_t)
__(void *)
__(void **)

#undef __

}  // namespace backends
}  // namespace cinn
diff --git a/paddle/cinn/backends/llvm/llvm_util.h b/paddle/cinn/backends/llvm/llvm_util.h
new file mode 100644
index 0000000000000..b53b46af245d8
--- /dev/null
+++ b/paddle/cinn/backends/llvm/llvm_util.h
@@ -0,0 +1,55 @@
// Copyright (c) 2021 CINN Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#pragma once

#include <absl/strings/string_view.h>
#include <llvm/ADT/StringRef.h>
#include <llvm/IR/Module.h>
#include <llvm/Support/raw_ostream.h>

#include <string>

#include "cinn/common/type.h"

namespace cinn {
namespace backends {

template <typename T>
std::string DumpToString(const T &entity) {
  std::string buffer;
  llvm::raw_string_ostream os(buffer);
  entity.print(os);
  os.flush();
  return buffer;
  // return "\033[33m" + buffer + "\033[0m";  // Green
}

inline llvm::StringRef AsStringRef(absl::string_view str) { return llvm::StringRef(str.data(), str.size()); }

llvm::Type *CinnTypeToLLVMType(common::Type t, llvm::Module *m, bool is_vec = false);

template <typename T>
llvm::Type *llvm_type_of(llvm::Module *m);

}  // namespace backends
}  // namespace cinn
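A quick usage sketch of the helpers above (illustrative only, not part of the patch; `common::Float(32, 4)` is assumed to be the CINN spelling of a 4-lane f32 type):

  llvm::LLVMContext ctx;
  llvm::Module m("demo", ctx);
  auto *f32 = cinn::backends::llvm_type_of<float>(&m);             // -> "float"
  auto *vec = cinn::backends::CinnTypeToLLVMType(
      cinn::common::Float(32, /*lanes=*/4), &m, /*is_vec=*/true);  // -> "<4 x float>"
  LOG(INFO) << cinn::backends::DumpToString(*vec);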
diff --git a/paddle/cinn/backends/llvm/runtime_symbol_registry.cc b/paddle/cinn/backends/llvm/runtime_symbol_registry.cc
new file mode 100644
index 0000000000000..796a7f9b69216
--- /dev/null
+++ b/paddle/cinn/backends/llvm/runtime_symbol_registry.cc
@@ -0,0 +1,68 @@
// Copyright (c) 2021 CINN Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "cinn/backends/llvm/runtime_symbol_registry.h"

#include <absl/strings/string_view.h>
#include <glog/raw_logging.h>

#include <string>

#include "cinn/runtime/flags.h"
#include "gflags/gflags_declare.h"

DECLARE_bool(verbose_function_register);

namespace cinn {
namespace backends {

RuntimeSymbols &GlobalSymbolRegistry::Global() {
  static RuntimeSymbols symbols;
  return symbols;
}

void *RuntimeSymbols::Lookup(absl::string_view name) const {
  std::lock_guard<std::mutex> lock(mu_);
  auto it = symbols_.find(std::string(name));
  if (it != symbols_.end()) {
    return it->second;
  }

  return nullptr;
}

void RuntimeSymbols::Register(const std::string &name, void *address) {
#ifdef CINN_WITH_DEBUG
  if (FLAGS_verbose_function_register) {
    RAW_LOG_INFO("JIT Register function [%s]: %p", name.c_str(), address);
  }
#endif  // CINN_WITH_DEBUG
  std::lock_guard<std::mutex> lock(mu_);
  auto it = symbols_.find(name);
  if (it != symbols_.end()) {
    CHECK_EQ(it->second, address) << "Duplicate register symbol [" << name << "]";
    return;
  }

  symbols_.insert({name, reinterpret_cast<void *>(address)});
}

void RuntimeSymbols::Clear() {
  std::lock_guard<std::mutex> lock(mu_);
  symbols_.clear();
  scalar_holder_.clear();
}

}  // namespace backends
}  // namespace cinn
diff --git a/paddle/cinn/backends/llvm/runtime_symbol_registry.h b/paddle/cinn/backends/llvm/runtime_symbol_registry.h
new file mode 100644
index 0000000000000..91e82cb1ffad9
--- /dev/null
+++ b/paddle/cinn/backends/llvm/runtime_symbol_registry.h
@@ -0,0 +1,113 @@
// Copyright (c) 2021 CINN Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#pragma once

#include <absl/strings/string_view.h>
#include <glog/logging.h>

#include <cstring>
#include <map>
#include <mutex>  // NOLINT
#include <string>
#include <type_traits>
#include <vector>

#include "cinn/common/macros.h"

namespace cinn {
namespace backends {

class RuntimeSymbols {
 public:
  RuntimeSymbols() = default;

  RuntimeSymbols(const RuntimeSymbols &) = delete;

  RuntimeSymbols(RuntimeSymbols &&rhs) {
    symbols_ = std::move(rhs.symbols_);
    scalar_holder_ = std::move(rhs.scalar_holder_);
  }

  /**
   * Register function address.
   * @param name Name of the symbol.
   * @param address Address of the function.
   */
  void RegisterFn(const std::string &name, void *address) { Register(name, address); }

  /**
   * Register scalar.
   * @tparam T Type of the scalar.
   * @param name Name of the symbol.
   * @param val Scalar value.
   */
  template <typename T, typename = std::enable_if_t<std::is_scalar<T>::value>>
  void RegisterVar(const std::string &name, T val) {
    void *data_ptr = nullptr;
    {
      std::lock_guard<std::mutex> lock(mu_);
      auto &data = scalar_holder_[name];
      data.resize(sizeof(T));
      memcpy(data.data(), &val, sizeof(T));
      data_ptr = reinterpret_cast<void *>(data.data());
    }
    Register(name, data_ptr);
  }

  /**
   * Look up a symbol from the registry.
   * @param name Name of the symbol.
   * @return The address if it exists, otherwise nullptr.
   */
  void *Lookup(absl::string_view name) const;

  /**
   * Get all the symbols.
   */
  const std::map<std::string, void *> &All() const { return symbols_; }

  /**
   * Clear all the symbols.
   */
  void Clear();

 private:
  /**
   * Register an external symbol to the registry; the symbols in the registry will
   * finally be registered to the JIT.
   * @param name Name of the symbol in the JIT.
   * @param address The address of the variable in external space.
   */
  void Register(const std::string &name, void *address);

  mutable std::mutex mu_;
  std::map<std::string, void *> symbols_;
  std::map<std::string, std::vector<char>> scalar_holder_;
};

/**
 * Registry for runtime symbols; these symbols will be inserted into the JIT.
 */
class GlobalSymbolRegistry {
 public:
  static RuntimeSymbols &Global();

 private:
  GlobalSymbolRegistry() = default;
  CINN_DISALLOW_COPY_AND_ASSIGN(GlobalSymbolRegistry);
};

}  // namespace backends
}  // namespace cinn
diff --git a/paddle/cinn/backends/llvm/simple_jit.cc b/paddle/cinn/backends/llvm/simple_jit.cc
new file mode 100755
index 0000000000000..77f55e18644cd
--- /dev/null
+++ b/paddle/cinn/backends/llvm/simple_jit.cc
@@ -0,0 +1,133 @@
// Copyright (c) 2021 CINN Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "cinn/backends/llvm/simple_jit.h"

#include <llvm/AsmParser/Parser.h>
#include <llvm/ExecutionEngine/JITSymbol.h>
#include <llvm/ExecutionEngine/Orc/Core.h>
#include <llvm/ExecutionEngine/Orc/ExecutionUtils.h>
#include <llvm/ExecutionEngine/Orc/LLJIT.h>
#include <llvm/IR/Verifier.h>
#include <llvm/Passes/PassBuilder.h>
#include <llvm/Support/SourceMgr.h>
#include <llvm/Support/TargetSelect.h>
#include <llvm/Support/raw_ostream.h>

#include <memory>
#include <string>
#include <utility>

#include "cinn/backends/codegen_cuda_host.h"
#include "cinn/backends/llvm/cinn_runtime_llvm_ir.h"
#include "cinn/backends/llvm/codegen_llvm.h"
#include "cinn/backends/llvm/llvm_util.h"
#include "cinn/backends/llvm/runtime_symbol_registry.h"
#include "cinn/ir/ir_printer.h"
#include "cinn/runtime/intrinsic.h"

namespace cinn {
namespace backends {

void SimpleJIT::AddModule(std::unique_ptr<llvm::Module> module, bool optimize) {
  /*
  for (auto &fn : module->functions()) {
    LOG(INFO) << "fn:\n" << DumpToString(fn);
  }
  */
  CHECK(!llvm::verifyModule(*module, &llvm::errs())) << "Transformation resulted in an invalid module\n\nmodule:\n";

  bool debug = false;
  if (optimize) {
    llvm::PassBuilder pass_builder;
    llvm::LoopAnalysisManager loop_analysis_manager(debug);
    llvm::FunctionAnalysisManager function_analysis_manager(debug);
    llvm::CGSCCAnalysisManager cgscc_analysis_manager(debug);
    llvm::ModuleAnalysisManager module_analysis_manager(debug);

    pass_builder.registerModuleAnalyses(module_analysis_manager);
    pass_builder.registerCGSCCAnalyses(cgscc_analysis_manager);
    pass_builder.registerFunctionAnalyses(function_analysis_manager);
    pass_builder.registerLoopAnalyses(loop_analysis_manager);
    pass_builder.crossRegisterProxies(
        loop_analysis_manager, function_analysis_manager, cgscc_analysis_manager, module_analysis_manager);

    llvm::ModulePassManager module_pass_manager =
        pass_builder.buildPerModuleDefaultPipeline(llvm::PassBuilder::OptimizationLevel::O3);
    module_pass_manager.run(*module, module_analysis_manager);
  }

  VLOG(3) << "jit target: " << jit_->getDataLayout().getStringRepresentation();
  VLOG(3) << "module target: " << module->getDataLayout().getStringRepresentation();

  llvm::orc::ThreadSafeModule tsm(std::move(module), context_);
  llvm::cantFail(jit_->addIRModule(std::move(tsm)));

  if (debug) {
    std::string buffer;
    llvm::raw_string_ostream os(buffer);
    jit_->getExecutionSession().dump(os);
    os.flush();
    VLOG(3) << "compiled jit:\n" << buffer;
  }
}

SimpleJIT::SimpleJIT() : context_(std::make_unique<llvm::LLVMContext>()) {
  llvm::InitializeAllTargetInfos();
  llvm::InitializeAllTargets();
  llvm::InitializeAllTargetMCs();
  llvm::InitializeAllAsmParsers();
  llvm::InitializeAllAsmPrinters();

  jit_ = llvm::cantFail(llvm::orc::LLJITBuilder().create());
  CHECK(jit_) << "JIT create failed";

  auto proc_symbols_generator = llvm::cantFail(
      llvm::orc::DynamicLibrarySearchGenerator::GetForCurrentProcess(jit_->getDataLayout().getGlobalPrefix()));
  jit_->getMainJITDylib().addGenerator(std::move(proc_symbols_generator));

  llvm::orc::MangleAndInterner mangle(jit_->getExecutionSession(), jit_->getDataLayout());

  for (auto &item : GlobalSymbolRegistry::Global().All()) {
    VLOG(2) << "Insert [" << item.first << "] to SimpleJIT";
    llvm::cantFail(jit_->define(llvm::orc::absoluteSymbols(
        {{mangle(item.first), {llvm::pointerToJITTargetAddress(item.second), llvm::JITSymbolFlags::None}}})));
  }
}

template <typename CodeGenT>
void SimpleJIT::Link(ir::Module module, bool optimize) {
  std::string runtime_ir(backends::kRuntimeLlvmIr);
  llvm::SMDiagnostic error;
  auto m = llvm::parseAssemblyString(runtime_ir, error, context());
  m->setDataLayout(jit_->getDataLayout());
  auto b = std::make_unique<llvm::IRBuilder<>>(context());

  auto ir_emitter = std::make_unique<CodeGenT>(m.get(), b.get());
  ir_emitter->Compile(module);

  CHECK(!llvm::verifyModule(*m, &llvm::errs())) << "Invalid module found";

  AddModule(std::move(m), optimize);
}

template void SimpleJIT::Link<CodeGenLLVM>(ir::Module module, bool optimize);
template void SimpleJIT::Link<CodeGenCUDA_Host>(ir::Module module, bool optimize);

}  // namespace backends

}  // namespace cinn
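A minimal end-to-end sketch of the JIT path (illustrative only, not part of the patch; the module `m`, the function name "fn_add", and the lowered `(cinn_pod_value_t *, int)` ABI are assumptions for illustration). Symbols registered beforehand through GlobalSymbolRegistry::Global() are defined into the JIT's main dylib by the constructor above, so external runtime calls resolve at link time:

  // Hypothetical: `m` is an ir::Module that already contains a lowered "fn_add".
  auto jit = cinn::backends::SimpleJIT::Create();
  jit->Link<cinn::backends::CodeGenLLVM>(m, /*optimize=*/true);
  auto addr = jit->Lookup("fn_add");
  auto *fn_add = reinterpret_cast<void (*)(cinn_pod_value_t *, int)>(addr);  // assumed ABI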
diff --git a/paddle/cinn/backends/llvm/simple_jit.h b/paddle/cinn/backends/llvm/simple_jit.h
new file mode 100755
index 0000000000000..ebbae127c3d8e
--- /dev/null
+++ b/paddle/cinn/backends/llvm/simple_jit.h
@@ -0,0 +1,82 @@
// Copyright (c) 2021 CINN Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#pragma once

#include <absl/strings/string_view.h>
#include <llvm/ExecutionEngine/JITSymbol.h>
#include <llvm/ExecutionEngine/Orc/LLJIT.h>
#include <llvm/ExecutionEngine/Orc/ThreadSafeModule.h>
#include <llvm/IR/IRBuilder.h>
#include <llvm/IR/LLVMContext.h>
#include <llvm/IR/Module.h>
#include <llvm/Support/Error.h>

#include <memory>
#include <string>
#include <utility>

#include "cinn/backends/llvm/codegen_llvm.h"
#include "cinn/backends/llvm/llvm_util.h"
#include "cinn/backends/llvm/runtime_symbol_registry.h"
#include "cinn/ir/module.h"
#include "cinn/runtime/intrinsic.h"

namespace cinn {
namespace backends {

class SimpleJIT {
 public:
  static std::unique_ptr<SimpleJIT> Create() { return std::unique_ptr<SimpleJIT>(new SimpleJIT); }

  /**
   * Runtime link to a module.
   * @tparam CodeGenT a CodeGenLLVM implementation.
   * @param module a CINN module.
   * @param optimize whether to optimize.
   */
  template <typename CodeGenT>
  void Link(ir::Module module, bool optimize = true);

  void Link(llvm::orc::ThreadSafeModule m, bool optimize = true) { llvm::cantFail(jit_->addIRModule(std::move(m))); }

  llvm::JITTargetAddress Lookup(absl::string_view name) {
    return llvm::cantFail(jit_->lookup(AsStringRef(name))).getAddress();
  }

 private:
  void AddModule(std::unique_ptr<llvm::Module> module, bool optimize);

  llvm::LLVMContext &context() { return *context_.getContext(); }

  SimpleJIT();

  std::unique_ptr<llvm::orc::LLJIT> jit_;
  llvm::orc::ThreadSafeContext context_;
};

}  // namespace backends
}  // namespace cinn
diff --git a/paddle/cinn/backends/modular.cc b/paddle/cinn/backends/modular.cc
new file mode 100644
index 0000000000000..e09c06b0d43ef
--- /dev/null
+++ b/paddle/cinn/backends/modular.cc
@@ -0,0 +1,128 @@
// Copyright (c) 2021 CINN Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "cinn/backends/modular.h"

#include <limits>

#include "cinn/ir/ir_visitor.h"

namespace cinn {
namespace backends {

class ModularEvaluator : public ir::IRVisitorBase<ModularEntry> {
 public:
  explicit ModularEvaluator(const std::map<Var, ModularEntry>& mod_map) : mod_map_(mod_map) {}

  ModularEntry Eval(const Expr& e) { return ir::IRVisitorBase<ModularEntry>::Visit(&e); }

  ModularEntry Visit(const ir::IntImm* op) {
    if (op->value < std::numeric_limits<int>::max()) {
      return ModularEntry{static_cast<int>(op->value), 0};
    }
    return ModularEntry::everything();
  }

  ModularEntry Visit(const ir::UIntImm* op) {
    if (op->value < std::numeric_limits<int>::max()) {
      return ModularEntry{static_cast<int>(op->value), 0};
    }
    return ModularEntry::everything();
  }

  ModularEntry Visit(const ir::_Var_* op) {
    Var var(&Reference(op));
    auto it = mod_map_.find(var);
    if (it != mod_map_.end()) return it->second;
    return ModularEntry::everything();
  }

  ModularEntry Visit(const ir::Add* op) {
    auto a = Eval(op->a());
    auto b = Eval(op->b());
    ModularEntry ret;
    ret.coeff = gcd(a.coeff, b.coeff);
    ret.base = BaseSimplify(a.base + b.base, ret.coeff);
    return ret;
  }

  ModularEntry Visit(const ir::Sub* op) {
    auto a = Eval(op->a());
    auto b = Eval(op->b());

    ModularEntry ret;
    ret.coeff = gcd(a.coeff, b.coeff);
    ret.base = BaseSimplify(a.base - b.base, ret.coeff);
    return ret;
  }

  ModularEntry Visit(const ir::Mul* op) {
    auto a = Eval(op->a());
    auto b = Eval(op->b());

    int pq = a.coeff * b.coeff;
    int pm = a.coeff * b.base;
    int qn = a.base * b.coeff;

    ModularEntry ret;
    ret.coeff = gcd(pq, gcd(pm, qn));
    ret.base = BaseSimplify(a.base * b.base, ret.coeff);
    return ret;
  }

  ModularEntry Visit(const ir::Div* op) {
    auto a = Eval(op->a());
    auto b = Eval(op->b());

    if (b.coeff % b.base == 0) {
      ModularEntry ret;
      ret.coeff = a.coeff / b.base;
      ret.base = 0;
      return ret;
    }

    return ModularEntry::everything();
  }

  static int BaseSimplify(int base, int coeff) {
    if (coeff == 0) return base;
    base = base % coeff;
    if (base < 0) base += coeff;
    return base;
  }

  static int gcd(int a, int b) {
    CHECK_GE(a, 0);
    CHECK_GE(b, 0);
    if (a < b) std::swap(a, b);
    if (b == 0) return a;

    while (a % b != 0) {
      a = a % b;
      std::swap(a, b);
    }
    return b;
  }

 private:
  const std::map<Var, ModularEntry>& mod_map_;
};

ModularEntry ModularEntry::Add(const ModularEntry& a, const ModularEntry& b) {
  ModularEntry ret;
  ret.coeff = ModularEvaluator::gcd(a.coeff, b.coeff);
  ret.base = ModularEvaluator::BaseSimplify(a.base + b.base, ret.coeff);
  return ret;
}

}  // namespace backends
}  // namespace cinn
diff --git a/paddle/cinn/backends/modular.h b/paddle/cinn/backends/modular.h
new file mode 100644
index 0000000000000..a72bc9f922b18
--- /dev/null
+++ b/paddle/cinn/backends/modular.h
@@ -0,0 +1,40 @@
// Copyright (c) 2021 CINN Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#pragma once

#include <map>

#include "cinn/ir/ir.h"

namespace cinn {
namespace backends {

// borrowed from Halide and TVM.
struct ModularEntry {
  int base;
  int coeff;

  ModularEntry() = default;
  ModularEntry(int base, int coeff) : base(base), coeff(coeff) {}

  static ModularEntry everything() { return ModularEntry{0, 1}; }

  static ModularEntry Add(const ModularEntry& a, const ModularEntry& b);
};

ModularEntry EvalModular(const Expr& e, const std::map<Var, ModularEntry>& mod_map);

}  // namespace backends
}  // namespace cinn
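A worked example of the evaluator's arithmetic: for the index expression `4 * i + 6` with `i` unconstrained, the constant 4 evaluates to {base 4, coeff 0} and the variable to everything() = {base 0, coeff 1}; Mul combines them into {base 0, coeff 4}, and Add then folds in {base 6, coeff 0} to give {base 2, coeff 4}, i.e. the expression is provably congruent to 2 (mod 4). This is the kind of alignment fact the backend can exploit for strided buffer accesses.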
diff --git a/paddle/cinn/backends/nvrtc/CMakeLists.txt b/paddle/cinn/backends/nvrtc/CMakeLists.txt
new file mode 100644
index 0000000000000..a344b65ca93e4
--- /dev/null
+++ b/paddle/cinn/backends/nvrtc/CMakeLists.txt
@@ -0,0 +1,8 @@
core_gather_headers()

gather_srcs(cinnapi_src SRCS
  header_generator.cc
  nvrtc_util.cc
)

nv_test(test_nvrtc_util SRCS nvrtc_util_test.cc DEPS cinncore)
diff --git a/paddle/cinn/backends/nvrtc/header_generator.cc b/paddle/cinn/backends/nvrtc/header_generator.cc
new file mode 100644
index 0000000000000..85972814bcbc0
--- /dev/null
+++ b/paddle/cinn/backends/nvrtc/header_generator.cc
@@ -0,0 +1,44 @@
// Copyright (c) 2022 CINN Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "cinn/backends/nvrtc/header_generator.h"

#include "glog/logging.h"
#include "jitify.hpp"

namespace cinn {
namespace backends {
namespace nvrtc {

HeaderGeneratorBase& JitSafeHeaderGenerator::GetInstance() {
  static JitSafeHeaderGenerator instance;
  return instance;
}

const size_t JitSafeHeaderGenerator::size() const {
  CHECK_EQ(include_names_.size(), headers_.size()) << "Internal error in size of header files.";
  return include_names_.size();
}

JitSafeHeaderGenerator::JitSafeHeaderGenerator() {
  const auto& headers_map = ::jitify::detail::get_jitsafe_headers_map();
  for (auto& pair : headers_map) {
    include_names_.emplace_back(pair.first.data());
    headers_.emplace_back(pair.second.data());
  }
}

}  // namespace nvrtc
}  // namespace backends
}  // namespace cinn
diff --git a/paddle/cinn/backends/nvrtc/header_generator.h b/paddle/cinn/backends/nvrtc/header_generator.h
new file mode 100644
index 0000000000000..1e6e57665857e
--- /dev/null
+++ b/paddle/cinn/backends/nvrtc/header_generator.h
@@ -0,0 +1,47 @@
// Copyright (c) 2022 CINN Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#pragma once

#include <cstddef>
#include <vector>

namespace cinn {
namespace backends {
class HeaderGeneratorBase {
 public:
  virtual const size_t size() const = 0;
  virtual const std::vector<const char*>& headers() const = 0;
  virtual const std::vector<const char*>& include_names() const = 0;
};

namespace nvrtc {

class JitSafeHeaderGenerator : public HeaderGeneratorBase {
 public:
  static HeaderGeneratorBase& GetInstance();
  const size_t size() const;
  const std::vector<const char*>& headers() const override { return headers_; }
  const std::vector<const char*>& include_names() const override { return include_names_; }

 private:
  JitSafeHeaderGenerator();
  std::vector<const char*> headers_;
  std::vector<const char*> include_names_;
};

}  // namespace nvrtc
}  // namespace backends
}  // namespace cinn
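The generator exists so that JIT-compiled CUDA sources can `#include` standard headers without touching the filesystem. A sketch of the intended wiring (illustrative; the real call site is in nvrtc_util.cc below):

  const auto &gen = cinn::backends::nvrtc::JitSafeHeaderGenerator::GetInstance();
  nvrtcProgram prog;
  // Hand jitify's built-in header bodies to NVRTC so in-memory includes resolve.
  nvrtcCreateProgram(&prog, code.c_str(), nullptr, gen.size(),
                     gen.headers().data(), gen.include_names().data());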
diff --git a/paddle/cinn/backends/nvrtc/nvrtc_util.cc b/paddle/cinn/backends/nvrtc/nvrtc_util.cc
new file mode 100644
index 0000000000000..4598054701129
--- /dev/null
+++ b/paddle/cinn/backends/nvrtc/nvrtc_util.cc
@@ -0,0 +1,239 @@
// Copyright (c) 2021 CINN Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "cinn/backends/nvrtc/nvrtc_util.h"

#include <cuda.h>
#include <cuda_runtime.h>
#include <nvrtc.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <unistd.h>

#include <fstream>
#include <string>

#include "cinn/backends/cuda_util.h"
#include "cinn/backends/nvrtc/header_generator.h"
#include "cinn/common/common.h"
#include "cinn/runtime/flags.h"
#include "cinn/utils/string.h"

DECLARE_string(cinn_nvcc_cmd_path);
DECLARE_bool(nvrtc_compile_to_cubin);

namespace cinn {
namespace backends {
namespace nvrtc {

std::string Compiler::operator()(const std::string& code, bool include_headers) {
  if (runtime::CanUseNvccCompiler()) {
    return CompileWithNvcc(code);
  }
  return CompileCudaSource(code, include_headers);
}

Compiler::Compiler() {
  if (FLAGS_nvrtc_compile_to_cubin) {
#if CUDA_VERSION >= 11010
    compile_to_cubin_ = true;
#endif
  }
  VLOG(4) << "FLAGS_nvrtc_compile_to_cubin: " << FLAGS_nvrtc_compile_to_cubin
          << ", compile_to_cubin_: " << compile_to_cubin_;
}

bool Compiler::compile_to_cubin() { return compile_to_cubin_; }

std::vector<std::string> Compiler::FindCUDAIncludePaths() {
  const std::string delimiter = "/";
  std::string cuda_include_path;
  const char* cuda_path_env = std::getenv("CUDA_PATH");
  if (cuda_path_env != nullptr) {
    cuda_include_path += cuda_path_env;
    cuda_include_path += delimiter + "include";
    return {cuda_include_path};
  }

#if defined(__linux__)
  struct stat st;
  cuda_include_path = "/usr/local/cuda/include";
  if (stat(cuda_include_path.c_str(), &st) == 0) {
    return {cuda_include_path};
  }
#endif
  LOG(FATAL) << "Cannot find cuda include path. "
             << "CUDA_PATH is not set or CUDA is not installed in the default installation path. "
             << "On platforms other than Linux, CUDA_PATH must be set.";
  return {cuda_include_path};
}

std::vector<std::string> Compiler::FindCINNRuntimeIncludePaths() { return {Context::Global().runtime_include_dir()}; }

std::string Compiler::CompileCudaSource(const std::string& code, bool include_headers) {
  const auto& header_gen = JitSafeHeaderGenerator::GetInstance();
  std::vector<std::string> compile_options;
  std::vector<const char*> param_cstrings{};
  nvrtcProgram prog;
  std::string cc = "30";
  int major, minor;
  cudaError_t e1 = cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, 0);
  cudaError_t e2 = cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, 0);

  if (e1 == cudaSuccess && e2 == cudaSuccess) {
    cc = std::to_string(major) + std::to_string(minor);
  } else {
    LOG(WARNING) << "cannot detect compute capability from your device, "
                 << "fall back to compute_30.";
  }
  if (compile_to_cubin_) {
    compile_options.push_back("-arch=sm_" + cc);
  } else {
    compile_options.push_back("-arch=compute_" + cc);
  }
  compile_options.push_back("-std=c++14");
  compile_options.push_back("-default-device");

  if (include_headers) {  // prepare include headers
    auto cuda_headers = FindCUDAIncludePaths();
    auto cinn_headers = FindCINNRuntimeIncludePaths();
    std::vector<std::string> include_paths;
    for (auto& header : cuda_headers) {
      include_paths.push_back("--include-path=" + header);
    }
    for (auto& header : cinn_headers) {
      include_paths.push_back("--include-path=" + header);
    }
    compile_options.insert(std::end(compile_options), include_paths.begin(), include_paths.end());
  }

  for (const auto& option : compile_options) {
    param_cstrings.push_back(option.c_str());
  }
  VLOG(3) << "compile options: " << utils::Join(compile_options, " ");
  NVRTC_CALL(nvrtcCreateProgram(
      &prog, code.c_str(), nullptr, header_gen.size(), header_gen.headers().data(), header_gen.include_names().data()));
  nvrtcResult compile_res = nvrtcCompileProgram(prog, param_cstrings.size(), param_cstrings.data());

  {  // get log
    size_t log_size;
    NVRTC_CALL(nvrtcGetProgramLogSize(prog, &log_size));
    std::string log;
    log.resize(log_size);
    NVRTC_CALL(nvrtcGetProgramLog(prog, &log[0]));
    CHECK_EQ(compile_res, NVRTC_SUCCESS) << log;
  }

  size_t size;
  std::string data;
  if (compile_to_cubin_) {
    NVRTC_CALL(nvrtcGetCUBINSize(prog, &size));
    data.resize(size);
    NVRTC_CALL(nvrtcGetCUBIN(prog, &data[0]));
  } else {
    NVRTC_CALL(nvrtcGetPTXSize(prog, &size));
    data.resize(size);
    NVRTC_CALL(nvrtcGetPTX(prog, &data[0]));
  }

  NVRTC_CALL(nvrtcDestroyProgram(&prog));
  return data;
}

std::string Compiler::CompileWithNvcc(const std::string& cuda_c) {
  // read dir source
  std::string dir = "./source";
  if (access(dir.c_str(), 0) == -1) {
    CHECK(mkdir(dir.c_str(), 7) != -1) << "Fail to mkdir " << dir;
  }

  // get unique prefix name
  prefix_name_ = dir + "/" + common::UniqName("rtc_tmp");

  auto cuda_c_file = prefix_name_ + ".cu";
  std::ofstream ofs(cuda_c_file, std::ios::out);
  CHECK(ofs.is_open()) << "Fail to open file " << cuda_c_file;
  ofs << cuda_c;
  ofs.close();

  CompileToPtx();
  CompileToCubin();

  return prefix_name_ + ".cubin";
}

// std::string Compiler::GetPtx() { return ReadFile(prefix_name_ + ".ptx", std::ios::in); }

void Compiler::CompileToPtx() {
  auto include_dir = common::Context::Global().runtime_include_dir();
  std::string include_dir_str = "";
  for (auto dir : include_dir) {
    if (include_dir_str.empty()) {
      include_dir_str = dir;
    } else {
      include_dir_str += ":" + dir;
    }
  }

  std::string options = std::string("export PATH=") + FLAGS_cinn_nvcc_cmd_path +
                        std::string(":$PATH && nvcc -std=c++14 --ptx -O3 -I ") + include_dir_str;
  options += " -arch=" + GetDeviceArch();
  options += " -o " + prefix_name_ + ".ptx";
  options += " " + prefix_name_ + ".cu";

  VLOG(2) << "Nvcc Compile Options : " << options;
  CHECK(system(options.c_str()) == 0) << options;
}

void Compiler::CompileToCubin() {
  std::string options =
      std::string("export PATH=") + FLAGS_cinn_nvcc_cmd_path + std::string(":$PATH && nvcc --cubin -O3");
  options += " -arch=" + GetDeviceArch();
  options += " -o " + prefix_name_ + ".cubin";
  options += " " + prefix_name_ + ".ptx";

  VLOG(2) << "Nvcc Compile Options : " << options;
  CHECK(system(options.c_str()) == 0) << options;
}

std::string Compiler::GetDeviceArch() {
  int major = 0, minor = 0;
  if (cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, 0) == cudaSuccess &&
      cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, 0) == cudaSuccess) {
    return "sm_" + std::to_string(major) + std::to_string(minor);
  } else {
    LOG(WARNING) << "cannot detect compute capability from your device, "
                 << "fall back to compute_30.";
    return "sm_30";
  }
}

std::string Compiler::ReadFile(const std::string& file_name, std::ios_base::openmode mode) {
  // open cubin file
  std::ifstream ifs(file_name, mode);
  CHECK(ifs.is_open()) << "Fail to open file " << file_name;
  ifs.seekg(0, std::ios::end);
  auto len = ifs.tellg();
  ifs.seekg(0);

  // read cubin file
  std::string file_data(len, ' ');
  ifs.read(&file_data[0], len);
  ifs.close();
  return file_data;
}

}  // namespace nvrtc
}  // namespace backends
}  // namespace cinn
diff --git a/paddle/cinn/backends/nvrtc/nvrtc_util.h b/paddle/cinn/backends/nvrtc/nvrtc_util.h
new file mode 100644
index 0000000000000..b13c24c550a63
--- /dev/null
+++ b/paddle/cinn/backends/nvrtc/nvrtc_util.h
@@ -0,0 +1,92 @@
// Copyright (c) 2021 CINN Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#pragma once
#ifdef CINN_WITH_CUDA
#if defined(__linux__)
#include <sys/stat.h>
#endif
#include <fstream>

#include <string>
#include <vector>

namespace cinn {
namespace backends {
namespace nvrtc {

/**
 * A helper class to call NVRTC: input CUDA device source code, get a PTX string.
 */
class Compiler {
 public:
  Compiler();

  /**
   * Compile the \p code and get PTX string.
   * @param code The CUDA source code.
   * @param include_headers Whether to include the headers of CUDA and CINN runtime modules.
   * @return Compiled PTX code string.
   */
  std::string operator()(const std::string& code, bool include_headers = true);

  /** Compile into cubin or not.
   * @return Compile into cubin or not.
   */
  bool compile_to_cubin();

 private:
  /**
   * Get the directories of CUDA's header files.
   * @return list of header file directories.
   */
  std::vector<std::string> FindCUDAIncludePaths();

  /**
   * Get the directories of CINN runtime's header files.
   * @return list of header file directories.
   */
  std::vector<std::string> FindCINNRuntimeIncludePaths();

  /**
   * Compile CUDA source code and get PTX or CUBIN.
   * @param code source code string.
   * @return PTX or CUBIN string.
   */
  std::string CompileCudaSource(const std::string& code, bool include_headers);

  /**
   * Whether to compile the source code into cubin; only works with CUDA 11.1 or newer.
   */
  bool compile_to_cubin_{false};

  // compile with nvcc
  std::string CompileWithNvcc(const std::string&);

  // compile to ptx
  void CompileToPtx();
  // compile to cubin
  void CompileToCubin();
  std::string GetDeviceArch();

  std::string ReadFile(const std::string&, std::ios_base::openmode);

  std::string prefix_name_{""};
};

}  // namespace nvrtc
}  // namespace backends
}  // namespace cinn

#endif  // CINN_WITH_CUDA
diff --git a/paddle/cinn/backends/nvrtc/nvrtc_util_test.cc b/paddle/cinn/backends/nvrtc/nvrtc_util_test.cc
new file mode 100644
index 0000000000000..9a21934130086
--- /dev/null
+++ b/paddle/cinn/backends/nvrtc/nvrtc_util_test.cc
@@ -0,0 +1,90 @@
+// Copyright (c) 2021 CINN Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+ +#include "cinn/backends/nvrtc/nvrtc_util.h" + +#include + +namespace cinn { +namespace backends { +namespace nvrtc { + +TEST(Compiler, basic) { + Compiler compiler; + + std::string source_code = R"ROC( +extern "C" __global__ +void saxpy(float a, float *x, float *y, float *out, size_t n) +{ + size_t tid = blockIdx.x * blockDim.x + threadIdx.x; + if (tid < n) { + out[tid] = a * x[tid] + y[tid]; + } +} +)ROC"; + + auto ptx = compiler(source_code); + + LOG(INFO) << "ptx:\n" << ptx; +} + +TEST(Compiler, float16) { + Compiler compiler; + + std::string source_code = R"( +#include +#define CINN_WITH_CUDA +#include "float16.h" +using cinn::common::float16; + +extern "C" __global__ +void cast_fp32_to_fp16_cuda_kernel(const float* input, const int num, float16* out) { + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < num) { + out[idx] = float16(input[idx]); + } +} +)"; + + auto ptx = compiler(source_code); + + LOG(INFO) << "ptx:\n" << ptx; +} + +TEST(Compiler, bfloat16) { + Compiler compiler; + + std::string source_code = R"( +#include +#define CINN_WITH_CUDA +#include "bfloat16.h" +using cinn::common::bfloat16; + +extern "C" __global__ +void cast_fp32_to_bf16_cuda_kernel(const float* input, const int num, bfloat16* out) { + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < num) { + out[idx] = bfloat16(input[idx]); + } +} +)"; + + auto ptx = compiler(source_code); + + LOG(INFO) << "ptx:\n" << ptx; +} + +} // namespace nvrtc +} // namespace backends +} // namespace cinn diff --git a/paddle/cinn/backends/outputs.cc b/paddle/cinn/backends/outputs.cc new file mode 100644 index 0000000000000..65d4cc76899fe --- /dev/null +++ b/paddle/cinn/backends/outputs.cc @@ -0,0 +1,50 @@ +// Copyright (c) 2021 CINN Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "cinn/backends/outputs.h" + +namespace cinn { +namespace lang {} // namespace lang + +backends::Outputs backends::Outputs::object(const std::string &name) const { + Outputs updated = *this; + updated.object_name = name; + return updated; +} + +backends::Outputs backends::Outputs::bitcode(const std::string &name) const { + Outputs updated = *this; + updated.bitcode_name = name; + return updated; +} + +backends::Outputs backends::Outputs::c_header(const std::string &name) const { + Outputs updated = *this; + updated.c_header_name = name; + return updated; +} + +backends::Outputs backends::Outputs::c_source(const std::string &name) const { + Outputs updated = *this; + updated.c_source_name = name; + return updated; +} + +backends::Outputs backends::Outputs::cuda_source(const std::string &name) const { + Outputs updated = *this; + updated.cuda_source_name = name; + return updated; +} + +} // namespace cinn diff --git a/paddle/cinn/backends/outputs.h b/paddle/cinn/backends/outputs.h new file mode 100644 index 0000000000000..45c4c9e1418e7 --- /dev/null +++ b/paddle/cinn/backends/outputs.h @@ -0,0 +1,52 @@ +// Copyright (c) 2021 CINN Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include + +namespace cinn { +namespace backends { + +/** + * A struct specifying a collection of outputs. + */ +struct Outputs { + //! The name of the emitted object file. Empty if no object file is desired. + std::string object_name; + + //! The name of the emitted llvm bitcode. Empty if no bitcode file is desired. + std::string bitcode_name; + + //! The name of the emitted C header file. + std::string c_header_name; + + //! The name of the emitted C source file. + std::string c_source_name; + + //! The name of the emitted CUDA source file. + std::string cuda_source_name; + + Outputs object(const std::string& name) const; + + Outputs bitcode(const std::string& name) const; + + Outputs c_header(const std::string& name) const; + + Outputs c_source(const std::string& name) const; + + Outputs cuda_source(const std::string& name) const; +}; + +} // namespace backends +} // namespace cinn diff --git a/paddle/cinn/backends/raw_cuda_code_test.cu b/paddle/cinn/backends/raw_cuda_code_test.cu new file mode 100644 index 0000000000000..765ef5bd986bb --- /dev/null +++ b/paddle/cinn/backends/raw_cuda_code_test.cu @@ -0,0 +1,54 @@ +// Copyright (c) 2021 CINN Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include + +#include "cinn/backends/cuda_util.h" +#include "cinn/utils/timer.h" + +__global__ void elementwise_add_kernel(const float* __restrict__ A, + const float* __restrict__ B, + float* __restrict__ C) { + if ((blockIdx.x < 1024)) { + { + if ((threadIdx.x < 1024)) { + { + C[((1024 * blockIdx.x) + threadIdx.x)] = + (A[((1024 * blockIdx.x) + threadIdx.x)] + B[((1024 * blockIdx.x) + threadIdx.x)]); + } + } + } + } +} + +TEST(raw_cuda, basic) { + const int M = 1024; + const int N = 1024; + // allocate CUDA buffer + float *Ag, *Bg, *Cg; + const int num_bytes = M * N * sizeof(float); + cudaMalloc(&Ag, num_bytes); + cudaMalloc(&Bg, num_bytes); + cudaMalloc(&Cg, num_bytes); + + cinn::utils::Timer timer; + timer.Start(); + for (int i = 0; i < 1000; i++) { + elementwise_add_kernel<<<1024, 1024>>>(Ag, Bg, Cg); + } + CUDA_CALL(cudaDeviceSynchronize()); + float latency = timer.Stop(); + LOG(INFO) << "latency: " << latency / 1000; +} diff --git a/paddle/cinn/cinn.h b/paddle/cinn/cinn.h new file mode 100644 index 0000000000000..41ce22a7b54ba --- /dev/null +++ b/paddle/cinn/cinn.h @@ -0,0 +1,56 @@ +// Copyright (c) 2021 CINN Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +/** + * This file exposes some internal APIs to global cinn namespace to make usage more friendly. + */ +#pragma once +#include "cinn/backends/codegen_c.h" +#include "cinn/backends/codegen_c_x86.h" +#include "cinn/common/common.h" +#include "cinn/ir/ir_operators.h" +#include "cinn/lang/builtin.h" +#include "cinn/lang/compute.h" +#include "cinn/lang/lower.h" +#include "cinn/lang/placeholder.h" +#include "cinn/optim/optimize.h" + +namespace cinn { + +using backends::CodeGenC; +using backends::CodeGenCX86; +using backends::Outputs; +using ir::Module; +using ir::Var; +using lang::Buffer; +using lang::CallExtern; +using lang::CallLowered; +using lang::Compute; +using lang::Lower; +using lang::Placeholder; +using lang::ReduceAll; +using lang::ReduceAny; +using lang::ReduceMax; +using lang::ReduceMin; +using lang::ReduceMul; +using lang::ReduceSum; +using optim::Optimize; +using poly::CreateStages; + +using lang::logic_and; +using lang::logic_or; + +using common::Target; + +} // namespace cinn diff --git a/paddle/cinn/common/CMakeLists.txt b/paddle/cinn/common/CMakeLists.txt new file mode 100644 index 0000000000000..f45e2812960a0 --- /dev/null +++ b/paddle/cinn/common/CMakeLists.txt @@ -0,0 +1,36 @@ +core_gather_headers() + +gather_srcs(cinnapi_src SRCS + shared.cc + cinn_value.cc + type.cc + target.cc + object.cc + debug_manager.cc + info_registry.cc + graph_utils.cc + context.cc + axis.cc + ir_util.cc + test_helper.cc + # cuda_test_helper.cc + arithmatic.cc + cas.cc + union_find.cc + python_interpreter_guard.cc + ) + + message(STATUS "srcs: ${cinnapi_src}") + +cc_test(test_cinn_value SRCS cinn_value_test.cc DEPS cinncore) +cc_test(test_shared SRCS shared_test.cc DEPS cinncore) +cc_test(test_graph_utils SRCS graph_utils_test.cc DEPS cinncore) +cc_test(test_arithmatic SRCS arithmatic_test.cc DEPS cinncore) +cc_test(test_cas SRCS cas_test.cc DEPS cinncore) +cc_test(test_type SRCS type_test.cc DEPS cinncore) +cc_test(test_axis SRCS axis_test.cc DEPS cinncore) + +cc_test(test_fp16_bf16_host SRCS float16_bfloat16_host_test.cc DEPS gtest glog) +if (WITH_CUDA) +nv_test(test_fp16_bf16_cuda SRCS float16_bfloat16_cuda_test.cu DEPS gtest glog) +endif() diff --git a/paddle/cinn/common/arithmatic.cc b/paddle/cinn/common/arithmatic.cc new file mode 100644 index 0000000000000..8fd8bb6f6ec50 --- /dev/null +++ b/paddle/cinn/common/arithmatic.cc @@ -0,0 +1,310 @@ +// Copyright (c) 2021 CINN Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "cinn/common/arithmatic.h"
+
+#include
+#include
+#include
+#include
+#include
+
+#include "cinn/common/ir_util.h"
+#include "cinn/ir/ir_operators.h"
+#include "cinn/ir/ir_printer.h"
+#include "cinn/ir/ir_visitor.h"
+#include "cinn/utils/string.h"
+
+namespace cinn {
+namespace common {
+
+using utils::GetStreamCnt;
+using utils::Join;
+using utils::Replace;
+using utils::Split;
+using namespace ir;  // NOLINT
+
+#ifdef As
+#undef As
+#endif
+
+std::string ExprToGinacConverter::Repr(const ir::Expr& expr) {
+  auto* load_n = expr.As<Load>();
+  auto* var_n = expr.As<_Var_>();
+  auto* broadcast_n = expr.As<Broadcast>();
+  auto* mod_n = expr.As<Mod>();
+  auto* min_n = expr.As<Min>();
+  auto* max_n = expr.As<Max>();
+  auto* div_n = expr.As<Div>();
+  auto* frac_n = expr.As<FracOp>();
+  if (load_n || broadcast_n || mod_n || min_n || max_n || div_n || frac_n) {
+    std::string repr = GetStreamCnt(expr);
+    Replace(&repr, "[", "lsq_");
+    Replace(&repr, "]", "_rsq");
+    Replace(&repr, "(", "lb_");
+    Replace(&repr, ")", "_rb");
+    Replace(&repr, "+", "_add_");
+    Replace(&repr, "-", "_sub_");
+    Replace(&repr, ":", "_ref_");
+    Replace(&repr, "*", "_mul_");
+    Replace(&repr, "/", "_div_");
+    // remove the spaces
+    auto fields = utils::Split(repr, " ");
+    repr = utils::Join(fields, "_");
+    return repr;
+  } else if (var_n) {
+    return utils::GetStreamCnt(expr);
+  }
+  return "";
+}
+
+void ExprToGinacConverter::RecordExpr(const ir::Expr& expr) { repr_to_expr_[Repr(expr)] = expr; }
+
+GiNaC::ex ExprToGinacConverter::BuildHelper(ir::Expr expr) {
+  auto* load_n = expr.As<Load>();
+  auto* var_n = expr.As<_Var_>();
+  auto* int_n = expr.As<IntImm>();
+  auto* float_n = expr.As<FloatImm>();
+  auto* add_n = expr.As<Add>();
+  auto* sub_n = expr.As<Sub>();
+  auto* mul_n = expr.As<Mul>();
+  auto* div_n = expr.As<Div>
(); + auto* minus_n = expr.As(); + auto* broadcast_n = expr.As(); + auto* mod_n = expr.As(); + auto* frac_n = expr.As(); + auto* min_n = expr.As(); + auto* max_n = expr.As(); + + bool is_integer_math = expr.type().is_int(); + + bool is_invalid_arith = load_n || var_n || broadcast_n || mod_n || min_n || max_n; + if (is_integer_math) + is_invalid_arith = is_invalid_arith || div_n || frac_n; // GiNac can't deal with integer division. + + if (is_invalid_arith) { + RecordExpr(expr); + std::string repr = Repr(expr); + return CreateGinacSymbol(repr); + } else if (int_n) { + return int_n->value; + } else if (float_n) { + return float_n->value; + } else if (add_n) { + auto a = BuildHelper(add_n->a()); + auto b = BuildHelper(add_n->b()); + return (a + b) * 1; + } else if (sub_n) { + return (BuildHelper(sub_n->a()) - BuildHelper(sub_n->b())); + } else if (mul_n) { + return (BuildHelper(mul_n->a()) * BuildHelper(mul_n->b())); + } else if (div_n) { + return (BuildHelper(div_n->a()) / BuildHelper(div_n->b())); + } else if (frac_n) { + return (BuildHelper(frac_n->a()) / BuildHelper(frac_n->b())); + } else if (minus_n) { + return -BuildHelper(minus_n->v()); + } else { + CINN_NOT_IMPLEMENTED + } +} + +GiNaC::ex ExprToGinacConverter::operator()(Expr expr) { + // TODO(Superjomn) Replace this with common::IsPureMath( + auto complex_nodes = CollectIRNodes(expr, [](const Expr* n) { + return n->As() || // + n->As() || // + n->As() || // + n->As() || // + n->As() || // + n->As() || // + n->As() || // + n->As() || // + n->As() || // + n->As() || // + n->As() || // + n->As() || // + n->As() || // + n->As { + Expr condition; + Expr true_value; + Expr false_value; + + Select(Expr condition, Expr true_value, Expr false_value) + : ExprNode(condition, true_value, false_value); + return Expr(node); + } + + Type type() const override { + CHECK_EQ(true_value.type(), false_value.type()); + return true_value.type(); + } + + void Verify() const override; + + std::vector expr_fields() override { return {&condition, &true_value, &false_value}; } + std::vector expr_fields() const override { return {&condition, &true_value, &false_value}; } + + static const IrNodeTy _node_type_ = IrNodeTy::Select; +}; + +struct LoadStoreAddrMnger { + Expr tensor; // Should be a tensor or a scalar. + //! Tell whether the address is a tensor. + bool is_addr_tensor() const; + //! Tell whether the address is a scalar. + bool is_addr_scalar() const; +}; + +/** + * Load the value from a buffer (as an array). + */ +struct Load : public ExprNode, public LoadStoreAddrMnger { + std::vector indices; + //! The abstract offset. + Expr index() const; + + static Expr Make(Expr tensor, const std::vector& indices); + + std::vector expr_fields() override; + std::vector expr_fields() const override; + + void Verify() const override; + + const std::string& name() const; + + Type type() const override; + + static const IrNodeTy _node_type_ = IrNodeTy::Load; +}; + +/** + * Store a `value` to the buffer at a given `index`. + */ +struct Store : public ExprNode, public LoadStoreAddrMnger { + Expr value; + std::vector indices; + + static Expr Make(Expr tensor, Expr value, const std::vector& indices); + + std::vector expr_fields() override; + std::vector expr_fields() const override; + + void Verify() const override; + + const std::string& name() const; + + Type type() const override; + Expr index() const; + + static const IrNodeTy _node_type_ = IrNodeTy::Store; +}; + +/** + * Allocate a buffer with the given type and size. 
The buffer lives for at most the duration of the body statement, + * within which it is freed. + */ +struct Alloc : public ExprNode { + //! The destination of the allocation, this might be a buffer or a variable. + Expr destination; + //! Dimensions of this buffer (as a multi-dimensional array). + std::vector extents; + // NOTE the condition might be undefined, that means always true. + Expr condition; + // NOTE the body might be undefined, that means no specific logic other than default. + Expr body; + + Alloc() : ExprNode(Type()) {} + + static Expr Make(Expr dest, Type type, const std::vector& extents, Expr condition, Expr body); + + std::vector expr_fields() override; + std::vector expr_fields() const override; + + void Verify() const override; + + int32_t ConstantAllocationSize() const; + static int32_t ConstantAllocationSize(const std::vector& extents); + + static const IrNodeTy _node_type_ = IrNodeTy::Alloc; +}; + +/** + * Free the resources associated with the given buffer. + */ +struct Free : public ExprNode { + Expr destination; + + Free() : ExprNode(Type()) {} + + static Expr Make(Expr dest); + + void Verify() const override; + + static const IrNodeTy _node_type_ = IrNodeTy::Free; +}; + +struct IfThenElse : public ExprNode { + Expr condition; + Expr true_case; + Expr false_case; + + IfThenElse(Expr condition, Expr true_case, Expr false_case); + + static Expr Make(Expr condition, Expr true_case, Expr false_case = Expr()); + + void Verify() const override { + CHECK(condition.defined()); + CHECK(true_case.defined()); + CHECK_EQ(condition.type(), type_of()); + } + + std::vector expr_fields() override; + std::vector expr_fields() const override; + + static const IrNodeTy _node_type_ = IrNodeTy::IfThenElse; +}; + +enum class ForType : int { + Serial = 0, //! Serial execution. + Parallel = 1, //! Parallel execution. + Vectorized = 1 << 1, //! Vector SIMD loop annotation. + Unrolled = 1 << 2, //! Unroll annotation. + GPUThread = 1 << 3, //! GPU Thread. + GPUBlock = 1 << 4, //! GPU Block. + GPULane = 1 << 5, //! GPU Lane. 
+  Default = 1 << 6,
+};
+
+struct VectorizeInfo {
+  VectorizeInfo() = default;
+  VectorizeInfo(int level, int factor) : level(level), factor(factor) {}
+
+  int level{-1};
+  int factor{-1};
+
+  inline void set(int level, int factor) {
+    this->level = level;
+    this->factor = factor;
+  }
+  inline bool valid() const { return level >= 0 && factor > 0; }
+};
+
+struct BindInfo {
+  BindInfo() = default;
+  BindInfo(const ForType& for_type, const int& offset, const DeviceAPI& device)
+      : for_type(for_type), offset(offset), device(device) {}
+
+  ForType for_type{ForType::Default};
+  int offset{-1};
+  DeviceAPI device{DeviceAPI::UNK};
+
+  inline void set(const ForType& for_type, const int& offset, const DeviceAPI& device) {
+    this->for_type = for_type;
+    this->offset = offset;
+    this->device = device;
+  }
+  // offset should be in [0, 2], corresponding to the x, y and z thread/block dimensions
+  inline bool valid() const {
+    return offset >= 0 && offset < 3 && (for_type == ForType::GPUThread || for_type == ForType::GPUBlock);
+  }
+};
+
+struct ForBase {
+  ForType for_type() const { return for_type_; }
+  void set_for_type(ForType x) { for_type_ = x; }
+
+  void set_vectorize_info(const VectorizeInfo& x) {
+    if (x.valid()) set_vectorized();
+    vectorize_info_ = x;
+  }
+  void set_bind_info(const BindInfo& x) {
+    if (x.valid()) set_binded(x.for_type);
+    bind_info_ = x;
+  }
+  const VectorizeInfo& vectorize_info() const { return vectorize_info_; }
+  const BindInfo& bind_info() const { return bind_info_; }
+
+  void reset_vectorize_info() {
+    set_vectorized(false);
+    vectorize_info_.factor = -1;
+    vectorize_info_.level = -1;
+  }
+  void reset_bind_info() {
+    set_binded(bind_info_.for_type, false);
+    bind_info_.offset = -1;
+    bind_info_.device = DeviceAPI::UNK;
+  }
+
+  void set_serial() { for_type_ = ForType::Serial; }
+
+  void set_unrolled(bool x = true) {
+    if (x)
+      set_for_type_flag(ForType::Unrolled);
+    else
+      unset_for_type_flag(ForType::Unrolled);
+  }
+  void set_vectorized(bool x = true) {
+    if (x)
+      set_for_type_flag(ForType::Vectorized);
+    else
+      unset_for_type_flag(ForType::Vectorized);
+  }
+  void set_parallel(bool x = true) {
+    if (x)
+      set_for_type_flag(ForType::Parallel);
+    else
+      unset_for_type_flag(ForType::Parallel);
+  }
+  void set_binded(ForType for_type, bool x = true) {
+    if (x)
+      set_for_type_flag(for_type);
+    else
+      unset_for_type_flag(for_type);
+  }
+
+  inline bool is_serial() const { return for_type_ == ForType::Serial; }
+  inline bool is_default() const { return for_type_ == ForType::Default; }
+  inline bool is_unrolled() const { return tell_for_type_flag(ForType::Unrolled); }
+  inline bool is_vectorized() const { return tell_for_type_flag(ForType::Vectorized); }
+  inline bool is_parallel() const { return tell_for_type_flag(ForType::Parallel); }
+  inline bool is_binded() const {
+    return tell_for_type_flag(ForType::GPUBlock) || tell_for_type_flag(ForType::GPUThread);
+  }
+  inline bool is_gpu_block_binded() const { return tell_for_type_flag(ForType::GPUBlock); }
+  inline bool is_gpu_thread_binded() const { return tell_for_type_flag(ForType::GPUThread); }
+
+ private:
+  inline void set_for_type_flag(ForType type) { *reinterpret_cast<int*>(&for_type_) |= static_cast<int>(type); }
+  inline void unset_for_type_flag(ForType type) { *reinterpret_cast<int*>(&for_type_) &= ~static_cast<int>(type); }
+  inline bool tell_for_type_flag(ForType type) const { return static_cast<int>(for_type_) & static_cast<int>(type); }
+
+  ForType for_type_{ForType::Serial};
+  VectorizeInfo vectorize_info_;
+  BindInfo bind_info_;
+};
+
+/// LLVM loop unroll metadata information
+struct LLVMForLoopMeta {
+  enum UnrollMode { DefaultUnroll, FullyUnroll, NoUnroll };
+
+  UnrollMode unroll_mode{DefaultUnroll};
+  bool vectorization{true};
+};
+
+struct For : public ExprNode<For>, public ForBase {
+  //! The loop variable.
+  Var loop_var;
+  //! The minimum value of the iteration.
+  Expr min;
+  //! The extent of the iteration.
+  Expr extent;
+
+  Expr body;
+
+  DeviceAPI device_api;
+
+  LLVMForLoopMeta metadata;
+
+  static Expr Make(Var loop_var,
+                   Expr min,
+                   Expr extent,
+                   ForType for_type,
+                   DeviceAPI device_api,
+                   Expr body,
+                   VectorizeInfo vector_info = VectorizeInfo(),
+                   BindInfo bind_info = BindInfo());
+
+  void Verify() const override;
+
+  std::vector<Expr*> expr_fields() override;
+  std::vector<const Expr*> expr_fields() const override;
+
+  static const IrNodeTy _node_type_ = IrNodeTy::For;
+};
+
+//! Polyhedral for-loop, whose condition is more complex than the normal `For`'s.
+struct PolyFor : public ExprNode<PolyFor>, public ForBase {
+  //! The iterator variable.
+  Var iterator;
+  //! Initial value of the iterator.
+  Expr init;
+  //! The condition to continue the loop.
+  Expr condition;
+  //! Increase the iterator.
+  Expr inc;
+  //! The forloop body.
+  Expr body;
+
+  DeviceAPI device_api;
+
+  PolyFor() : ExprNode(Type()) {}
+
+  Expr ExtractExtent() const;
+
+  static Expr Make(Var iterator,
+                   Expr init_val,
+                   Expr condition,
+                   Expr inc,
+                   ForType for_type,
+                   DeviceAPI device_api,
+                   Expr body,
+                   VectorizeInfo vector_info = VectorizeInfo(),
+                   BindInfo bind_info = BindInfo());
+
+  void Verify() const override;
+
+  std::vector<Expr*> expr_fields() override;
+  std::vector<const Expr*> expr_fields() const override;
+
+  static const IrNodeTy _node_type_ = IrNodeTy::PolyFor;
+};
+
+//! A linear ramp node.
+struct Ramp : public ExprNode<Ramp> {
+  Expr base, stride;
+  int lanes;
+
+  static Expr Make(Expr base, Expr stride, int lanes);
+
+  void Verify() const override;
+
+  static const IrNodeTy _node_type_ = IrNodeTy::Ramp;
+};
+
+//! A vector with `lanes` elements and all of them are `value`.
+struct Broadcast : public ExprNode { + Expr value; + int lanes; + + static Expr Make(Expr value, int lanes); + + Type type() const override; + + void Verify() const override; + + std::vector expr_fields() override { return {&value}; } + std::vector expr_fields() const override { return {&value}; } + + static const IrNodeTy _node_type_ = IrNodeTy::Broadcast; +}; + +struct FracOp : public BinaryOpNode { + FracOp() { operands().resize(2); } + + static Expr Make(Expr n, Expr d); + + bool is_constant() const { return a().is_constant() && b().is_constant(); } + + double get_constant() const { + CHECK(is_constant()); + CHECK_NE(b().get_constant(), 0.f); + return a().get_constant() / b().get_constant(); + } + + void Verify() const override; + + static const IrNodeTy _node_type_ = IrNodeTy::FracOp; + + using ExprNode::operands; +}; + +struct Product : public ExprNode { + static Expr Make(const std::vector& vs); + + using ExprNode::operand; + + Type type() const override { return operands().front().type(); } + + void Verify() const override; + + static const IrNodeTy _node_type_ = IrNodeTy::Product; +}; + +struct Sum : public ExprNode { + static Expr Make(const std::vector& vs); + + using ExprNode::operand; + + Type type() const override { return operands().front().type(); } + + void Verify() const override; + + static const IrNodeTy _node_type_ = IrNodeTy::Sum; +}; + +struct Block : public ExprNode { + std::vector stmts; + + Block() : ExprNode(Type()) {} + + static Expr Make(const std::vector& stmts); + + void Verify() const override; + + std::vector expr_fields() override; + std::vector expr_fields() const override; + + static const IrNodeTy _node_type_ = IrNodeTy::Block; +}; + +// ScheduleBlock is the unit of schedule IR which represents tensor's computation +struct ScheduleBlock : public ExprNode { + std::vector iter_vars; + // BufferRange(s) which is read in this schedule block, it is used to + // analyze, not a real computation expression. Must be AST DFS order. + std::vector read_buffers; + // BufferRange(s) which is written in this schedule block, it is used to + // analyze, not a real computation expression. Must be AST DFS order. + std::vector write_buffers; + // Additional attributes about this schedulable block, + // which take some auxiliary hints for future transformations. + std::map attrs; + std::string name; + Expr body; + + static Expr Make(const std::vector& iter_vars, + const std::vector& read_buffers, + const std::vector& write_buffers, + const std::string& name, + Expr body); + + void Verify() const override; + + std::vector expr_fields() override; + std::vector expr_fields() const override; + + static const IrNodeTy _node_type_ = IrNodeTy::ScheduleBlock; +}; + +// ScheduleBlockRealize is used to execute ScheduleBlock with the binding iter_values +struct ScheduleBlockRealize : public ExprNode { + // values of the iter_vars + std::vector iter_values; + Expr schedule_block; + + static Expr Make(const std::vector& iter_values, const Expr& schedule_block); + + void Verify() const override; + + std::vector expr_fields() override; + std::vector expr_fields() const override; + + static const IrNodeTy _node_type_ = IrNodeTy::ScheduleBlockRealize; +}; + +/** + * Content of a module. 
+ */
+struct _Module_ : public ExprNode<_Module_> {
+  std::string name;
+  Target target;
+  std::vector<Expr> buffers;
+  std::vector<Expr> functions;
+  std::vector<Expr> submodules;
+
+  static ir::Module Make(const std::string& name, Target target);
+
+  void Verify() const override {}
+
+  static const IrNodeTy _node_type_ = IrNodeTy::_Module_;
+};
+
+/**
+ * \brief PrimitiveNode holds the concept of Primitive in CINN.
+ * A Primitive is a basic Call to some Expr function; it is introduced to create several levels of coarse-grained IR
+ * nodes for better IR optimization and hardware adaptation.
+ */
+struct PrimitiveNode : public ExprNode<PrimitiveNode> {
+  std::string name;
+  //! the inputs of the PrimitiveNode, the vector<vector<Expr>> can hold variadic arguments.
+  std::vector<std::vector<Expr>> arguments;
+  //! the attribute of this PrimitiveNode.
+  std::map<std::string, attr_t> attrs;
+
+  static Expr Make(const std::string& name, const std::map<std::string, attr_t>& attrs);
+
+  void Verify() const override;
+
+  static const IrNodeTy _node_type_ = IrNodeTy::PrimitiveNode;
+};
+
+// possible keys of attributes in ir nodes, which are listed in the following namespace
+namespace attr {
+
+// max permitted steps for auto_unroll, used in unroll_loop pass
+constexpr const char* auto_unroll_max_step = "auto_unroll_max_step";
+// record the extra loop built during ComputeAt, used to calculate the size of the temp buffer in post-processing
+constexpr const char* compute_at_extra_var = "compute_at_extra_var";
+// record the extra loop built during ReverseComputeAt, used to calculate the size of the temp buffer in post-processing
+constexpr const char* reverse_compute_at_extra_var = "reverse_compute_at_extra_var";
+// record the cooperative process info, used in post schedule rule (CooperativeProcess)
+constexpr const char* cooperative_process = "cooperative_process";
+
+}  // namespace attr
+
+}  // namespace ir
+
+// Expose the following to cinn namespace for easier usage.
+// @{
+using ir::Expr;
+using ir::Var;
+// @}
+
+}  // namespace cinn
diff --git a/paddle/cinn/ir/ir_base.cc b/paddle/cinn/ir/ir_base.cc
new file mode 100644
index 0000000000000..19c8004fd2bf4
--- /dev/null
+++ b/paddle/cinn/ir/ir_base.cc
@@ -0,0 +1,231 @@
+// Copyright (c) 2021 CINN Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "cinn/ir/ir_base.h"
+
+#include "cinn/common/cinn_value.h"
+#include "cinn/common/common.h"
+#include "cinn/ir/buffer.h"
+#include "cinn/ir/ir.h"
+#include "cinn/ir/ir_printer.h"
+#include "cinn/ir/ir_visitor.h"
+#include "cinn/ir/module.h"
+#include "cinn/ir/tensor.h"
+
+namespace cinn {
+namespace ir {
+
+using cinn::common::bfloat16;
+using cinn::common::float16;
+
+//! Implementations for Ir Expr Nodes.
+// @{ +#define __m(t__) \ + template <> \ + void ExprNode::Accept(cinn::ir::IRVisitor *v) const { \ + v->Visit(const_self()); \ + } +#undef __m +// @} + +std::ostream &operator<<(std::ostream &os, IrNodeTy type) { + switch (type) { +#define __m(t__) \ + case IrNodeTy::t__: \ + os << ""; \ + break; + + NODETY_FORALL(__m) +#undef __m + + default: + LOG(FATAL) << "unknown IrNodeTy found"; + } + + return os; +} + +Expr Zero(const Type &type) { + if (type.is_bfloat16()) return Expr(bfloat16(0.f)); + if (type.is_float16()) return Expr(float16(0.f)); + if (type.is_float(32)) return Expr(0.f); + if (type.is_float(64)) return Expr(double(0.)); // NOLINT + + if (type.is_bool()) return Expr(false); + + if (type.is_int(8)) return Expr(int8_t(0)); + if (type.is_int(16)) return Expr(int16_t(0)); + if (type.is_int(32)) return Expr(int32_t(0)); + if (type.is_int(64)) return Expr(int64_t(0)); + + if (type.is_uint(8)) return Expr(uint8_t(0)); + if (type.is_uint(16)) return Expr(uint16_t(0)); + if (type.is_uint(32)) return Expr(uint32_t(0)); + if (type.is_uint(64)) return Expr(uint64_t(0)); + CINN_NOT_IMPLEMENTED + return Expr(); +} + +Expr One(const Type &type) { + if (type.is_bfloat16()) return Expr(bfloat16(1.f)); + if (type.is_float16()) return Expr(float16(1.f)); + if (type.is_float(32)) return Expr(1.f); + if (type.is_float(64)) return Expr(double(1.)); // NOLINT + + if (type.is_bool()) return Expr(true); + + if (type.is_int(8)) return Expr(int8_t(1)); + if (type.is_int(16)) return Expr(int16_t(1)); + if (type.is_int(32)) return Expr(int32_t(1)); + if (type.is_int(64)) return Expr(int64_t(1)); + + if (type.is_uint(8)) return Expr(uint8_t(1)); + if (type.is_uint(16)) return Expr(uint16_t(1)); + if (type.is_uint(32)) return Expr(uint32_t(1)); + if (type.is_uint(64)) return Expr(uint64_t(1)); + CINN_NOT_IMPLEMENTED + return Expr(); +} + +Expr::Expr(const Var &var) { *static_cast(this) = *static_cast(&var); } +bool Expr::as_bool() const { + CHECK(type().is_uint(1)); + return As()->value; +} + +int8_t Expr::as_int8() const { + CHECK(type().is_int(8)); + return As()->value; +} +int16_t Expr::as_int16() const { + CHECK(type().is_int(16)); + return As()->value; +} +int32_t Expr::as_int32() const { + CHECK(type().is_int(32)); + return As()->value; +} +int64_t Expr::as_int64() const { + CHECK(type().is_int(64)); + return As()->value; +} + +uint8_t Expr::as_uint8() const { + CHECK(type().is_uint(8)); + return As()->value; +} +uint16_t Expr::as_uint16() const { + CHECK(type().is_uint(16)); + return As()->value; +} +uint32_t Expr::as_uint32() const { + CHECK(type().is_uint(32)); + return As()->value; +} +uint64_t Expr::as_uint64() const { + CHECK(type().is_uint(64)); + return As()->value; +} + +bfloat16 Expr::as_bfloat16() const { + CHECK(type().is_bfloat16()); + return bfloat16(As()->value); +} +float16 Expr::as_float16() const { + CHECK(type().is_float16()); + return float16(As()->value); +} +float Expr::as_float() const { + CHECK(type().is_float(32)); + return As()->value; +} +double Expr::as_double() const { + CHECK(type().is_float(64)); + return As()->value; +} + +Expr &Expr::operator=(const Expr &other) { + *static_cast(this) = *static_cast(&other); + return *this; +} + +Expr::operator Var() { + auto *x = As(); + CHECK(x); + return ir::Var(x); +} + +bool Expr::is_constant() const { return As() || As() || As(); } + +double Expr::get_constant() const { + CHECK(is_constant()) << *this << " is not constant! 
Please check.";
+  auto *vi = As<IntImm>();
+  auto *vf = As<FloatImm>();
+  if (vi) return vi->value;
+  return vf->value;
+}
+
+bool Expr::is_var() const { return As<_Var_>(); }
+
+_Buffer_ *Expr::as_buffer() { return As<_Buffer_>(); }
+const _Buffer_ *Expr::as_buffer() const { return As<_Buffer_>(); }
+Buffer Expr::as_buffer_ref() const { return Buffer(&Reference(as_buffer())); }
+
+_LoweredFunc_ *Expr::as_lowered_func() { return As<_LoweredFunc_>(); }
+const _LoweredFunc_ *Expr::as_lowered_func() const { return As<_LoweredFunc_>(); }
+
+_Module_ *Expr::as_module() { return As<_Module_>(); }
+const _Module_ *Expr::as_module() const { return As<_Module_>(); }
+ir::Module Expr::as_module_ref() const {
+  auto *module = as_module();
+  CHECK(module);  // Need check here?
+  // TODO(Superjomn) remove the Reference here.
+  return ir::Module(&Reference(module));
+}
+
+LoweredFunc Expr::as_lowered_func_ref() const {
+  auto *function = as_lowered_func();
+  CHECK(function);
+  return LoweredFunc(&Reference(function));
+}
+
+_Tensor_ *Expr::as_tensor() { return As<_Tensor_>(); }
+const _Tensor_ *Expr::as_tensor() const { return As<_Tensor_>(); }
+ir::Tensor Expr::as_tensor_ref() const { return ir::Tensor(&Reference(as_tensor())); }
+
+_Var_ *Expr::as_var() { return As<_Var_>(); }
+const _Var_ *Expr::as_var() const { return As<_Var_>(); }
+Var Expr::as_var_ref() const { return Var(&Reference(as_var())); }
+
+bool Expr::is_cmp() const {
+  switch (node_type()) {
+    case ir::IrNodeTy::LE:
+    case ir::IrNodeTy::LT:
+    case ir::IrNodeTy::EQ:
+    case ir::IrNodeTy::NE:
+    case ir::IrNodeTy::GT:
+    case ir::IrNodeTy::GE:
+      return true;
+    default:
+      return false;
+  }
+}
+
+const Expr &IrNode::operand(int i) {
+  CHECK_LT(i, operands.size());
+  return operands[i];
+}
+
+}  // namespace ir
+}  // namespace cinn
diff --git a/paddle/cinn/ir/ir_base.h b/paddle/cinn/ir/ir_base.h
new file mode 100644
index 0000000000000..b1baf1d59fdea
--- /dev/null
+++ b/paddle/cinn/ir/ir_base.h
@@ -0,0 +1,500 @@
+// Copyright (c) 2021 CINN Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
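Before the header itself, a small illustrative sketch (not part of the patch) of how the pieces implemented in ir_base.cc above compose: Zero()/One() build typed immediate nodes, and the as_xxx accessors CHECK the stored type before unwrapping it. It uses glog's CHECK macros, as the surrounding code does.

    #include "cinn/ir/ir_base.h"

    void TypedImmediatesDemo() {
      using cinn::ir::Expr;

      Expr zero = cinn::ir::Zero(cinn::common::Int(32));  // an IntImm holding 0
      CHECK(zero.is_constant());
      CHECK_EQ(zero.as_int32(), 0);  // OK: the stored type is int32
      // zero.as_float() would CHECK-fail: the stored type is int, not float

      Expr one = cinn::ir::One(cinn::common::Float(32));  // a FloatImm holding 1.0f
      CHECK_EQ(one.as_float(), 1.0f);
      CHECK_EQ(one.get_constant(), 1.0);  // type-erased accessor for IntImm/FloatImm
    }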
+ +#pragma once + +#include + +#include +#include +#include +#include + +#include "cinn/common/common.h" +#include "cinn/common/object.h" +#include "cinn/common/shared.h" +#include "cinn/common/type.h" + +namespace cinn { + +namespace ir { +using common::BFloat16; +using common::Float; +using common::Float16; +using common::Int; +using common::Type; +using common::type_of; + +class Module; +class IRVisitor; +class _Buffer_; +class Buffer; +class _Module_; +class _LoweredFunc_; +class LoweredFunc; +class _Tensor_; +class Tensor; +class _Var_; +class Var; +class _BufferRange_; +class BufferRange; +class ScheduleBlock; +class ScheduleBlockRealize; + +// clang-format off +#define NODETY_PRIMITIVE_TYPE_FOR_EACH(macro__) \ + macro__(IntImm) \ + macro__(UIntImm) \ + macro__(FloatImm) \ + macro__(StringImm) \ + +#define NODETY_BINARY_OP_FOR_EACH(macro__) \ + macro__(Add) \ + macro__(Sub) \ + macro__(Mul) \ + macro__(Div) \ + macro__(Mod) \ + macro__(EQ) \ + macro__(NE) \ + macro__(LT) \ + macro__(LE) \ + macro__(GT) \ + macro__(GE) \ + macro__(And) \ + macro__(Or) \ + macro__(Min) \ + macro__(Max) \ + +#define NODETY_UNARY_OP_FOR_EACH(macro__) \ + macro__(Minus) \ + macro__(Not) \ + +#define NODETY_OP_FOR_EACH(macro__) NODETY_BINARY_OP_FOR_EACH(macro__) NODETY_UNARY_OP_FOR_EACH(macro__) + +#define NODETY_CONTROL_OP_FOR_EACH(macro__) \ + macro__(Cast) \ + macro__(For) \ + macro__(PolyFor) \ + macro__(Select) \ + macro__(IfThenElse) \ + macro__(Block) \ + macro__(Call) \ + macro__(_Var_) \ + macro__(Load) \ + macro__(Store) \ + macro__(Alloc) \ + macro__(Free) \ + macro__(_Buffer_) \ + macro__(_Tensor_) \ + macro__(_LoweredFunc_) \ + macro__(_Module_) \ + macro__(Let) \ + macro__(Reduce) \ + macro__(Ramp) \ + macro__(Broadcast) \ + macro__(FracOp) \ + macro__(Product) \ + macro__(Sum) \ + macro__(PrimitiveNode) \ + macro__(IntrinsicOp) \ + macro__(_BufferRange_) \ + macro__(ScheduleBlock) \ + macro__(ScheduleBlockRealize) \ + + +#define NODETY_FORALL(__m) \ + NODETY_PRIMITIVE_TYPE_FOR_EACH(__m) \ + NODETY_OP_FOR_EACH(__m) \ + NODETY_CONTROL_OP_FOR_EACH(__m) +// clang-format on + +//! Define IrNodeTy +// @{ +#define __m(x__) x__, +enum class IrNodeTy { kUnk = -1, NODETY_FORALL(__m) }; +#undef __m +// @} + +//! String representations for IrNodeTy. +// @{ +#define __m(x__) #x__, +const std::vector kIrNodeTyReprs({NODETY_FORALL(__m) "None"}); +#undef __m +// @} + +std::ostream& operator<<(std::ostream& os, IrNodeTy type); + +struct Expr; + +/** + * The base of all the nodes in the IR. + */ +class IrNode : public common::Object { + public: + //! The operands of this operator. + std::vector operands; + + IrNode() = default; + explicit IrNode(Type t) : type_(t) {} + virtual ~IrNode() = default; + + virtual IrNodeTy node_type() const { return IrNodeTy::kUnk; } + virtual Type type() const { return type_; } + void set_type(Type type) { type_ = type; } + + //! Get i-th operand + const Expr& operand(int i); + + //! Gather all the expression fields in this node for easier visit and mutate. + virtual std::vector expr_fields() { return {}; } + virtual std::vector expr_fields() const { return {}; } + + const char* type_info() const override { return __type_info__; } + + //! Verify the current IR node's correctness. + virtual void Verify() const { CINN_NOT_IMPLEMENTED } + + protected: + static constexpr char* __type_info__ = "IRNode"; + Type type_; +}; + +/** + * A handle to store any IRNode. 
+ */
+class IrNodeRef : public common::Shared<IrNode> {
+ public:
+  IrNodeRef() = default;
+  IrNodeRef(const IrNodeRef& other) : Shared(other.p_) {}
+  explicit IrNodeRef(IrNode* x) : Shared(x) {}
+
+  virtual IrNodeTy node_type() const { return operator->()->node_type(); }
+
+  template <typename T>
+  const T* As() const {
+    static_assert(std::is_base_of<IrNode, T>::value);
+    CHECK(get()) << "IrNodeRef holds null";
+    if (node_type() == T::_node_type_) return static_cast<const T*>(get());
+    return nullptr;
+  }
+  template <typename T>
+  T* As() {
+    if (node_type() == T::_node_type_) return static_cast<T*>(get());
+    return nullptr;
+  }
+
+  void operator=(const IrNodeRef& other) {
+    *static_cast<Shared<IrNode>*>(this) = *static_cast<const Shared<IrNode>*>(&other);
+  }
+
+  IrNode* ptr() { return get(); }
+  IrNode* ptr() const { return get(); }
+};
+
+template <typename T>
+struct ExprNode : public IrNode {
+  ExprNode() : IrNode(Type()) {}
+  explicit ExprNode(Type t) : IrNode(t) { set_type(t); }
+  explicit ExprNode(int num_operands) { operands().resize(num_operands); }
+
+  T* self() { return static_cast<T*>(this); }
+  const T* const_self() const { return dynamic_cast<const T*>(this); }
+
+  const std::vector<Expr>& operands() const { return IrNode::operands; }
+  std::vector<Expr>& operands() { return IrNode::operands; }
+
+  Expr& operand(int i) {
+    CHECK_LT(i, operands().size());
+    return operands()[i];
+  }
+  const Expr& operand(int i) const {
+    CHECK_LT(i, operands().size());
+    return operands()[i];
+  }
+
+  virtual Expr Copy() const;
+
+  IrNodeTy node_type() const override { return T::_node_type_; }
+};
+
+struct IntImm : public ExprNode<IntImm> {
+  int64_t value;
+
+  IntImm(Type t, int64_t v) : ExprNode(t), value(v) { Verify(); }
+
+  void Verify() const override {
+    CHECK(type().is_int());
+    CHECK(type().is_scalar());
+    CHECK(type().bits() == 8 || type().bits() == 16 || type().bits() == 32 || type().bits() == 64);
+  }
+
+  static const IrNodeTy _node_type_ = IrNodeTy::IntImm;
+};
+
+struct UIntImm : public ExprNode<UIntImm> {
+  uint64_t value;
+
+  UIntImm(Type t, uint64_t v) : ExprNode(t), value(v) { Verify(); }
+
+  void Verify() const override {
+    CHECK(type().is_uint());
+    CHECK(type().is_scalar());
+    CHECK(type().bits() == 1 /*bool*/ || type().bits() == 8 || type().bits() == 16 || type().bits() == 32 ||
+          type().bits() == 64);
+  }
+
+  static const IrNodeTy _node_type_ = IrNodeTy::UIntImm;
+};
+
+struct FloatImm : public ExprNode<FloatImm> {
+  double value;
+
+  FloatImm(Type t, double v) : ExprNode(t), value(v) { Verify(); }
+
+  void Verify() const override {
+    CHECK(type().is_float());
+    CHECK(type().is_scalar());
+  }
+
+  static const IrNodeTy _node_type_ = IrNodeTy::FloatImm;
+};
+
+struct StringImm : public ExprNode<StringImm> {
+  std::string value;
+
+  explicit StringImm(const std::string& value) : value(value) { Verify(); }
+
+  void Verify() const override {}
+
+  static const IrNodeTy _node_type_ = IrNodeTy::StringImm;
+};
+
+class Var;
+/**
+ * An expression that represents some value or the result of some operations.
+ */
+struct Expr : public IrNodeRef {
+ public:
+  Expr() = default;
+  Expr(const Expr& other) : IrNodeRef(other.ptr()) {}
+  Expr(IrNode* p) : IrNodeRef(p) {}  // NOLINT
+  explicit Expr(const Var& var);
+
+  //! Helper function to construct numeric constants of various types.
+ // @{ + explicit Expr(bool x) : IrNodeRef(new UIntImm(UInt(1), x)) {} + + explicit Expr(int8_t x) : IrNodeRef(new IntImm(Int(8), x)) {} + explicit Expr(int16_t x) : IrNodeRef(new IntImm(Int(16), x)) {} + explicit Expr(int32_t x) : IrNodeRef(new IntImm(Int(32), x)) {} + explicit Expr(int64_t x) : IrNodeRef(new IntImm(Int(64), x)) {} + + explicit Expr(uint8_t x) : IrNodeRef(new UIntImm(UInt(8), x)) {} + explicit Expr(uint16_t x) : IrNodeRef(new UIntImm(UInt(16), x)) {} + explicit Expr(uint32_t x) : IrNodeRef(new UIntImm(UInt(32), x)) {} + explicit Expr(uint64_t x) : IrNodeRef(new UIntImm(UInt(64), x)) {} + + explicit Expr(cinn::common::bfloat16 x) : IrNodeRef(new FloatImm(BFloat16(), x)) {} + explicit Expr(cinn::common::float16 x) : IrNodeRef(new FloatImm(Float16(), x)) {} + explicit Expr(float x) : IrNodeRef(new FloatImm(Float(32), x)) {} + explicit Expr(double x) : IrNodeRef(new FloatImm(Float(64), x)) {} + + explicit Expr(const std::string& x) : IrNodeRef(new StringImm(x)) {} + // @} + + Expr& operator=(const Expr& other); + + // primitive types + // @{ + bool as_bool() const; + + int8_t as_int8() const; + int16_t as_int16() const; + int32_t as_int32() const; + int64_t as_int64() const; + + uint8_t as_uint8() const; + uint16_t as_uint16() const; + uint32_t as_uint32() const; + uint64_t as_uint64() const; + + cinn::common::bfloat16 as_bfloat16() const; + cinn::common::float16 as_float16() const; + float as_float() const; + double as_double() const; + // @} + + _Var_* as_var(); + const _Var_* as_var() const; + Var as_var_ref() const; + + // @{ Other nodes caster. + _Buffer_* as_buffer(); + const _Buffer_* as_buffer() const; + Buffer as_buffer_ref() const; + + _LoweredFunc_* as_lowered_func(); + const _LoweredFunc_* as_lowered_func() const; + LoweredFunc as_lowered_func_ref() const; + + _Module_* as_module(); + const _Module_* as_module() const; + ir::Module as_module_ref() const; + + _Tensor_* as_tensor(); + const _Tensor_* as_tensor() const; + ir::Tensor as_tensor_ref() const; + // @} + + bool is_constant() const; + double get_constant() const; + + //! Tell if this is a compare op. 
+ bool is_cmp() const; + + bool is_var() const; + + operator Var(); + + Type type() const { return p_->type(); } +}; + +template +struct UnaryOpNode : public ExprNode { + UnaryOpNode() { operands().resize(1); } + UnaryOpNode(Type type, Expr v) : ExprNode(type) { + CHECK(v.defined()); + operands().resize(1); + this->v() = v; + } + + Type type() const override { + CHECK(v().defined()); + return v().type(); + } + + Expr& v() { return operands().front(); } + const Expr& v() const { return operands().front(); } + + std::vector expr_fields() override { return {&v()}; } + std::vector expr_fields() const override { return {&v()}; } + + using ExprNode::operands; +}; + +template +struct BinaryOpNode : public ExprNode { + BinaryOpNode() { operands().resize(2); } + BinaryOpNode(Type type, Expr a, Expr b) : ExprNode(type) { + CHECK(type.valid()); + CHECK(a.defined()); + CHECK(b.defined()); + operands().resize(2); + this->a() = a; + this->b() = b; + // CHECK_EQ(a.type(), b.type()) << "the type of two argument not match"; + } + + Expr& a() { return ExprNode::operand(0); } + Expr& b() { return ExprNode::operand(1); } + const Expr& a() const { return ExprNode::operand(0); } + const Expr& b() const { return ExprNode::operand(1); } + + Type type() const override { return a().type(); } + + std::vector expr_fields() override { return {&a(), &b()}; } + std::vector expr_fields() const override { return {&a(), &b()}; } + + using ExprNode::operands; +}; + +//! Zero in CINN type system. +Expr Zero(const Type& type); +Expr One(const Type& type); + +#define DEVICE_API_FOR_ALL(__) \ + __(UNK) \ + __(Host) \ + __(GPU) \ + __(CUDA) \ + __(OpenCL) + +#define __decl__(x) x, +enum class DeviceAPI { DEVICE_API_FOR_ALL(__decl__) }; +#undef __decl__ + +static std::ostream& operator<<(std::ostream& os, DeviceAPI x) { + switch (x) { +#define __decl__(x) \ + case DeviceAPI::x: \ + os << #x; \ + break; + + DEVICE_API_FOR_ALL(__decl__) +#undef __decl__ + + default: + break; + } + return os; +} + +#define MEMORY_TYPE_FOR_ALL(__) \ + __(Auto, "Auto") \ + __(Heap, "Heap") \ + __(Stack, "Stack") \ + __(GPUShared, "GPUShared") \ + __(GPULocal, "GPULocal") \ +/** \ + * An enum describing different address spaces to be used with Func::store_in. \ + */ +enum class MemoryType { +#define __(token__, token_repr__) token__, + MEMORY_TYPE_FOR_ALL(__) +#undef __ +}; + +static std::ostream& operator<<(std::ostream& os, MemoryType t) { + switch (t) { +#define __(token__, token_repr__) \ + case MemoryType::token__: \ + os << token_repr__; \ + break; + + MEMORY_TYPE_FOR_ALL(__) + + default: + LOG(FATAL) << "Not supported memory type"; +#undef __ + } + return os; +} + +template +Expr ExprNode::Copy() const { + LOG(FATAL) << "Not Implemented"; + return Expr(); +} + +} // namespace ir +} // namespace cinn + +namespace std { + +template <> +struct hash { + size_t operator()(const cinn::ir::Expr& x) { return reinterpret_cast(x.get()); } +}; + +} // namespace std diff --git a/paddle/cinn/ir/ir_compare.cc b/paddle/cinn/ir/ir_compare.cc new file mode 100644 index 0000000000000..16a0672d51fea --- /dev/null +++ b/paddle/cinn/ir/ir_compare.cc @@ -0,0 +1,319 @@ +// Copyright (c) 2022 CINN Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "cinn/ir/ir_compare.h"
+
+#include <regex>
+
+#include "cinn/ir/ir_base.h"
+#include "cinn/ir/ir_printer.h"
+
+namespace cinn {
+namespace ir {
+
+bool IrEqualVisitor::Compare(const Expr& lhs, const Expr& rhs) {
+  if (lhs.get() == rhs.get()) {  // the same object, including both are null
+    return true;
+  }
+
+  if (!lhs.defined() || !rhs.defined()) {  // someone invalid
+    VLOG(5) << "Not equal on Expr, someone not defined";
+    return false;
+  }
+  bool equal = lhs->node_type() == rhs->node_type();
+  equal = equal && IRVisitorBase<bool, const Expr*>::Visit(&lhs, &rhs);
+
+  if (!equal) {
+    VLOG(5) << "Not equal on Expr, lhs:[type:" << kIrNodeTyReprs[static_cast<int>(lhs->node_type())] << "]\n"
+            << lhs << ", \nrhs[type:" << kIrNodeTyReprs[static_cast<int>(rhs->node_type())] << "]\n"
+            << rhs;
+  }
+  return equal;
+}
+
+bool IrEqualVisitor::Compare(const std::string& lhs, const std::string& rhs, bool allow_name_suffix_diff) {
+  // if allow_name_suffix_diff=true, then just compare the name prefix before the "_[0-9]+"
+  auto common_len = 0;
+  for (; common_len < lhs.size() && common_len < rhs.size(); ++common_len) {
+    if (lhs[common_len] != rhs[common_len]) break;
+  }
+
+  auto is_endswith_index = [&common_len](const std::string& name) {
+    const std::regex txt_regex("_\\d+");
+    return common_len == name.size() || std::regex_match(name.substr(common_len), txt_regex);
+  };
+
+  bool equal = false;
+  if (common_len == lhs.size() && common_len == rhs.size()) {
+    equal = true;
+  } else {
+    equal = false;
+    if (allow_name_suffix_diff) {
+      equal = is_endswith_index(lhs) && is_endswith_index(rhs);
+    }
+  }
+
+  if (!equal) {
+    VLOG(5) << "Not equal on name, lhs=" << lhs << ", rhs=" << rhs;
+  }
+
+  return equal;
+}
+
+bool IrEqualVisitor::Compare(const std::map<std::string, attr_t>& lhs, const std::map<std::string, attr_t>& rhs) {
+  if (lhs.size() != rhs.size()) {
+    VLOG(6) << "Not equal on attrs, lhs size=" << lhs.size() << ", rhs size=" << rhs.size();
+    return false;
+  }
+  for (auto&& kv : lhs) {
+    auto opposite = rhs.find(kv.first);
+    if (opposite == rhs.end() || kv.second != opposite->second) {
+      VLOG(6) << "Not equal at attr key=" << kv.first;
+      return false;
+    }
+  }
+  return true;
+}
+
+template <typename T>
+bool IrEqualVisitor::Compare(const std::vector<T>& lhs, const std::vector<T>& rhs) {
+  if (lhs.size() != rhs.size()) {
+    VLOG(6) << "Not equal on repeated fields, lhs size=" << lhs.size() << ", rhs size=" << rhs.size();
+    return false;
+  }
+  for (auto i = 0; i < lhs.size(); ++i) {
+    if (!Compare(lhs.at(i), rhs.at(i))) {
+      VLOG(6) << "Not equal on repeated fields at index=" << i;
+      return false;
+    }
+  }
+  return true;
+}
+
+#define PRIMITIVE_TYPE_IMPL(op__)                                  \
+  bool IrEqualVisitor::Visit(const op__* lhs, const Expr* other) { \
+    auto* rhs = other->As<op__>();                                 \
+    return lhs->value == rhs->value;                               \
+  }
+
+#define UNARY_OP_IMPL(op__)                                        \
+  bool IrEqualVisitor::Visit(const op__* lhs, const Expr* other) { \
+    auto* rhs = other->As<op__>();                                 \
+    return Compare(lhs->v(), rhs->v());                            \
+  }
+
+#define BINARY_OP_IMPL(op__)                                           \
+  bool IrEqualVisitor::Visit(const op__* lhs, const Expr* other) {     \
+    auto* rhs = other->As<op__>();                                     \
+    return Compare(lhs->a(), rhs->a()) && Compare(lhs->b(), rhs->b()); \
+  }
+
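These three IMPL macros are instantiated by the NODETY_*_FOR_EACH X-macro lists right below, stamping out one Visit overload per node type. For instance, PRIMITIVE_TYPE_IMPL(IntImm) expands to roughly:

    bool IrEqualVisitor::Visit(const IntImm* lhs, const Expr* other) {
      auto* rhs = other->As<IntImm>();
      // rhs is non-null here: Compare() only dispatches to Visit() after it
      // has verified that both sides share the same node_type().
      return lhs->value == rhs->value;
    }

So primitive nodes compare by stored value, unary nodes by their single operand, and binary nodes by both operands.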
+NODETY_PRIMITIVE_TYPE_FOR_EACH(PRIMITIVE_TYPE_IMPL) +NODETY_UNARY_OP_FOR_EACH(UNARY_OP_IMPL) +NODETY_BINARY_OP_FOR_EACH(BINARY_OP_IMPL) + +#undef PRIMITIVE_TYPE_IMPL +#undef UNARY_OP_IMPL +#undef BINARY_OP_IMPL + +bool IrEqualVisitor::Visit(const Cast* lhs, const Expr* other) { + auto* rhs = other->As(); + return lhs->type() == rhs->type() && Compare(lhs->v(), rhs->v()); +} + +bool IrEqualVisitor::Visit(const For* lhs, const Expr* other) { + auto* rhs = other->As(); + return lhs->for_type() == rhs->for_type() && Compare(lhs->loop_var, rhs->loop_var) && Compare(lhs->min, rhs->min) && + Compare(lhs->extent, rhs->extent) && Compare(lhs->body, rhs->body); +} + +bool IrEqualVisitor::Visit(const PolyFor* lhs, const Expr* other) { + auto* rhs = other->As(); + return lhs->for_type() == rhs->for_type() && Compare(lhs->iterator, rhs->iterator) && Compare(lhs->init, rhs->init) && + Compare(lhs->condition, rhs->condition) && Compare(lhs->inc, rhs->inc) && Compare(lhs->body, rhs->body); +} + +bool IrEqualVisitor::Visit(const Select* lhs, const Expr* other) { + auto* rhs = other->As(); + IRVisitorBase::Visit(&node->condition, &node->condition); + IRVisitorBase::Visit(&node->true_value, &node->true_value); + IRVisitorBase::Visit(&node->false_value, &node->false_value); +} +template +void IRMutator::Visit(const IfThenElse *expr, T op) { + auto *node = op->template As(); + IRVisitorBase::Visit(&node->condition, &node->condition); + IRVisitorBase::Visit(&node->true_case, &node->true_case); + if (node->false_case.defined()) IRVisitorBase::Visit(&node->false_case, &node->false_case); +} +template +void IRMutator::Visit(const Block *expr, T op) { + auto *node = op->template As(); + for (auto &expr : node->stmts) { + IRVisitorBase::Visit(&expr, &expr); + } +} +template +void IRMutator::Visit(const Call *expr, T op) { + auto *node = op->template As(); + for (auto &expr : node->read_args) { + IRVisitorBase::Visit(&expr, &expr); + } + for (auto &expr : node->write_args) { + IRVisitorBase::Visit(&expr, &expr); + } +} +template +void IRMutator::Visit(const _Module_ *expr, T op) { + auto *node = op->template As<_Module_>(); + for (auto &func : node->functions) { + IRVisitorBase::Visit(&func, &func); + } + for (auto &func : node->buffers) { + IRVisitorBase::Visit(&func, &func); + } + for (auto &expr : node->submodules) { + IRVisitorBase::Visit(&expr, &expr); + } +} +template +void IRMutator::Visit(const _Var_ *expr, T op) { + auto *node = op->template As(); + if (node->lower_bound.defined()) { + IRVisitorBase::Visit(&node->lower_bound, &node->lower_bound); + } + if (node->upper_bound.defined()) { + IRVisitorBase::Visit(&node->upper_bound, &node->upper_bound); + } +} +template +void IRMutator::Visit(const Load *expr, T op) { + auto *node = op->template As(); + for (auto &idx : node->indices) IRVisitorBase::Visit(&idx, &idx); + IRVisitorBase::Visit(&node->tensor, &node->tensor); +} +template +void IRMutator::Visit(const Store *expr, T op) { + auto *node = op->template As(); + IRVisitorBase::Visit(&node->value, &node->value); + IRVisitorBase::Visit(&node->tensor, &node->tensor); + for (auto &idx : node->indices) IRVisitorBase::Visit(&idx, &idx); +} +template +void IRMutator::Visit(const Alloc *expr, T op) { + auto *node = op->template As(); + for (auto &e : node->extents) { + IRVisitorBase::Visit(&e, &e); + } + + if (node->condition.defined()) IRVisitorBase::Visit(&node->condition, &node->condition); + if (node->body.defined()) { + Expr body(node->body); + IRVisitorBase::Visit(&node->body, &body); + } +} +template +void 
IRMutator::Visit(const Free *expr, T op) { + auto *node = op->template As(); + IRVisitorBase::Visit(&node->destination, &node->destination); +} +template +void IRMutator::Visit(const _Buffer_ *expr, T op) { + auto *node = op->template As<_Buffer_>(); + + for (auto &e : node->shape) { + IRVisitorBase::Visit(&e, &e); + } + for (auto &e : node->strides) { + IRVisitorBase::Visit(&e, &e); + } + IRVisitorBase::Visit(&node->elem_offset, &node->elem_offset); +} +template +void IRMutator::Visit(const _Tensor_ *expr, T op) { + auto *node = op->template As<_Tensor_>(); + + for (auto &e : node->shape) { + IRVisitorBase::Visit(&e, &e); + } +} +template +void IRMutator::Visit(const _LoweredFunc_ *expr, T op) { + auto *node = op->template As<_LoweredFunc_>(); + IRVisitorBase::Visit(&node->body, &node->body); +} +template +void IRMutator::Visit(const Let *expr, T op) { + auto *node = op->template As(); + IRVisitorBase::Visit(&node->symbol, &node->symbol); + if (node->body.defined()) IRVisitorBase::Visit(&node->body, &node->body); +} +template +void IRMutator::Visit(const Reduce *expr, T op) { + auto *node = op->template As(); + if (node->init.defined()) IRVisitorBase::Visit(&node->init, &node->init); + CHECK(node->body.defined()); + IRVisitorBase::Visit(&node->body, &node->body); +} + +template +void IRMutator::Visit(const Ramp *expr, T op) { + auto *node = op->template As(); + IRVisitorBase::Visit(&node->base, &node->base); + IRVisitorBase::Visit(&node->stride, &node->stride); +} + +template +void IRMutator::Visit(const Broadcast *expr, T op) { + auto *node = op->template As(); + IRVisitorBase::Visit(&node->value, &node->value); +} + +template +void IRMutator::Visit(const FracOp *expr, T op) { + auto *node = op->template As(); + IRVisitorBase::Visit(&node->a(), &node->a()); + IRVisitorBase::Visit(&node->b(), &node->b()); +} + +template +void IRMutator::Visit(const Product *expr, T op) { + auto *node = op->template As(); + for (auto &x : node->operands()) { + IRVisitorBase::Visit(&x, &x); + } +} + +template +void IRMutator::Visit(const Sum *expr, T op) { + auto *node = op->template As(); + for (auto &x : node->operands()) { + IRVisitorBase::Visit(&x, &x); + } +} +template +void IRMutator::Visit(const PrimitiveNode *expr, T op) { + auto *node = op->template As(); + for (auto &args : node->arguments) { + for (auto &arg : args) { + IRVisitorBase::Visit(&arg, &arg); + } + } +} + +template +void IRMutator::Visit(const IntrinsicOp *expr, T op) { + auto *node = op->template As(); + switch (node->getKind()) { + case ir::IntrinsicKind::kBufferGetDataHandle: { + auto *n = llvm::dyn_cast(node); + Visit(&n->buffer, &n->buffer); + } break; + case ir::IntrinsicKind::kBufferGetDataConstHandle: { + auto *n = llvm::dyn_cast(node); + Visit(&n->buffer, &n->buffer); + } break; + case ir::IntrinsicKind::kPodValueToX: { + auto *n = llvm::dyn_cast(node); + Visit(&n->pod_value_ptr, &n->pod_value_ptr); + } break; + case ir::IntrinsicKind::kBuiltinIntrin: { + auto *n = llvm::dyn_cast(node); + for (auto &expr : n->args) { + Visit(&expr, &expr); + } + } break; + } +} + +template +void IRMutator::Visit(const _BufferRange_ *expr, T op) { + auto *node = op->template As<_BufferRange_>(); + CHECK(node); + IRVisitorBase::Visit(&node->buffer, &node->buffer); + for (auto &var : node->ranges) { + if (var->lower_bound.defined()) { + IRVisitorBase::Visit(&var->lower_bound, &var->lower_bound); + } + if (var->upper_bound.defined()) { + IRVisitorBase::Visit(&var->upper_bound, &var->upper_bound); + } + } +} + +template +void IRMutator::Visit(const 
ScheduleBlock *expr, T op) { + auto *node = op->template As(); + CHECK(node); + for (auto &var : node->iter_vars) { + if (var->lower_bound.defined()) { + IRVisitorBase::Visit(&var->lower_bound, &var->lower_bound); + } + if (var->upper_bound.defined()) { + IRVisitorBase::Visit(&var->upper_bound, &var->upper_bound); + } + } + for (auto &buffer_region : node->read_buffers) { + IRVisitorBase::Visit(&buffer_region, &buffer_region); + } + for (auto &buffer_region : node->write_buffers) { + IRVisitorBase::Visit(&buffer_region, &buffer_region); + } + IRVisitorBase::Visit(&(node->body), &(node->body)); +} + +template +void IRMutator::Visit(const ScheduleBlockRealize *expr, T op) { + auto *node = op->template As(); + CHECK(node); + for (auto &value : node->iter_values) { + IRVisitorBase::Visit(&value, &value); + } + IRVisitorBase::Visit(&node->schedule_block, &node->schedule_block); +} + +} // namespace ir +} // namespace cinn diff --git a/paddle/cinn/ir/ir_operators.cc b/paddle/cinn/ir/ir_operators.cc new file mode 100644 index 0000000000000..cc586971c11b2 --- /dev/null +++ b/paddle/cinn/ir/ir_operators.cc @@ -0,0 +1,153 @@ +// Copyright (c) 2021 CINN Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "cinn/ir/ir_operators.h" + +#include +#include + +#include "cinn/common/target.h" +#include "cinn/common/type.h" +#include "cinn/hlir/op/op_util.h" +#include "cinn/lang/compute.h" +#include "cinn/runtime/flags.h" + +namespace cinn { +namespace ir { +using attr_t = absl::variant; + +Expr operator<<(Expr a, Expr b) { + CHECK(a.type().is_int() || a.type().is_uint()); + CHECK(b.type().is_int() || b.type().is_uint()); + auto int_a = a.As(); + auto int_b = b.As(); + Type t_a = a.type(); + Type t_b = b.type(); + if (t_a.is_index_type() && t_b.is_index_type()) { + if (int_b) { + CHECK(int_b->value >= 0 && int_b->value < t_a.bits()) + << "Shift amount must be non-negative and less than " << t_a.bits() << " for type " << t_a << std::endl; + if (int_b->value == 0) return a; + } + if (int_a && int_b) { + return Expr(int_a->value << int_b->value); + } + } + return lang::CallExtern("left_shift", {a, b}, {{"vectorizable", false}}); +} + +Expr operator>>(Expr a, Expr b) { + CHECK(a.type().is_int() || a.type().is_uint()); + CHECK(b.type().is_int() || b.type().is_uint()); + auto int_a = a.As(); + auto int_b = b.As(); + Type t_a = a.type(); + Type t_b = b.type(); + if (t_a.is_index_type() && t_b.is_index_type()) { + if (int_b) { + CHECK(int_b->value >= 0 && int_b->value < t_a.bits()) + << "Shift amount must be non-negative and less than " << t_a.bits() << " for type " << t_a << std::endl; + if (int_b->value == 0) return a; + } + if (int_a && int_b) { + return Expr(int_a->value >> int_b->value); + } + } + return lang::CallExtern("right_shift", {a, b}, {{"vectorizable", false}}); +} + +Expr operator|(Expr a, Expr b) { + CHECK(a.type().is_int() || a.type().is_uint()); + CHECK(b.type().is_int() || b.type().is_uint()); + auto int_a = a.As(); + auto int_b = b.As(); + Type t_a = 
a.type(); + Type t_b = b.type(); + if (t_a.is_index_type() && t_b.is_index_type()) { + if (int_a && int_b) { + return Expr(int_a->value | int_b->value); + } + } + auto target = cinn::runtime::CurrentTarget::GetCurrentTarget(); + if (target.arch == common::Target::Arch::X86) { + return lang::CallExtern("bitwise_or", {a, b}, {{"vectorizable", false}}); + } else if (target.arch == common::Target::Arch::NVGPU) { + auto func_name = hlir::GetExternFuncName(target, t_a, "bitwise_or"); + return lang::CallExtern(func_name, {a, b}, {{"vectorizable", false}}); + } else { + LOG(FATAL) << "Unsupport arch: " << target.arch_str() << " for bitwise_or."; + } +} + +Expr operator&(Expr a, Expr b) { + CHECK(a.type().is_int() || a.type().is_uint()); + CHECK(b.type().is_int() || b.type().is_uint()); + auto int_a = a.As(); + auto int_b = b.As(); + Type t_a = a.type(); + Type t_b = b.type(); + if (t_a.is_index_type() && t_b.is_index_type()) { + if (int_a && int_b) { + return Expr(int_a->value & int_b->value); + } + } + auto target = cinn::runtime::CurrentTarget::GetCurrentTarget(); + if (target.arch == common::Target::Arch::X86) { + return lang::CallExtern("bitwise_and", {a, b}, {{"vectorizable", false}}); + } else if (target.arch == common::Target::Arch::NVGPU) { + auto func_name = hlir::GetExternFuncName(target, t_a, "bitwise_and"); + return lang::CallExtern(func_name, {a, b}, {{"vectorizable", false}}); + } else { + LOG(FATAL) << "Unsupport arch: " << target.arch_str() << " for bitwise_and."; + } +} + +Expr operator^(Expr a, Expr b) { + CHECK(a.type().is_int() || a.type().is_uint()); + CHECK(b.type().is_int() || b.type().is_uint()); + auto int_a = a.As(); + auto int_b = b.As(); + Type t_a = a.type(); + Type t_b = b.type(); + if (t_a.is_index_type() && t_b.is_index_type()) { + if (int_a && int_b) { + return Expr(int_a->value ^ int_b->value); + } + } + auto target = cinn::runtime::CurrentTarget::GetCurrentTarget(); + if (target.arch == common::Target::Arch::X86) { + return lang::CallExtern("bitwise_xor", {a, b}, {{"vectorizable", false}}); + } else if (target.arch == common::Target::Arch::NVGPU) { + auto func_name = hlir::GetExternFuncName(target, t_a, "bitwise_xor"); + return lang::CallExtern(func_name, {a, b}, {{"vectorizable", false}}); + } else { + LOG(FATAL) << "Unsupport arch: " << target.arch_str() << " for bitwise_xor."; + } +} + +Expr operator~(Expr a) { + CHECK(a.type().is_int() || a.type().is_uint()); + auto target = cinn::runtime::CurrentTarget::GetCurrentTarget(); + if (target.arch == common::Target::Arch::X86) { + return lang::CallExtern("bitwise_not", {a}, {{"vectorizable", false}}); + } else if (target.arch == common::Target::Arch::NVGPU) { + auto func_name = hlir::GetExternFuncName(target, a->type(), "bitwise_not"); + return lang::CallExtern(func_name, {a}, {{"vectorizable", false}}); + } else { + LOG(FATAL) << "Unsupport arch: " << target.arch_str() << " for bitwise_not."; + } +} + +} // namespace ir +} // namespace cinn diff --git a/paddle/cinn/ir/ir_operators.h b/paddle/cinn/ir/ir_operators.h new file mode 100644 index 0000000000000..a2a7b711573aa --- /dev/null +++ b/paddle/cinn/ir/ir_operators.h @@ -0,0 +1,133 @@ +// Copyright (c) 2021 CINN Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include + +#include "cinn/common/ir_util.h" +#include "cinn/ir/ir.h" + +namespace cinn { +namespace ir { + +//-- left hand -- +template ::value>::type> +Expr operator+(Expr a, POD b) { + return Add::Make(Expr(a), Expr(b)); +} +template ::value>::type> +Expr operator-(Expr a, POD b) { + return Sub::Make(Expr(a), Expr(b)); +} +template ::value>::type> +Expr operator*(Expr a, POD b) { + return Mul::Make(Expr(a), Expr(b)); +} +template ::value>::type> +Expr operator/(Expr a, POD b) { + return Div::Make(Expr(a), Expr(b)); +} +template ::value>::type> +Expr operator%(Expr a, POD b) { + return Mod::Make(Expr(a), Expr(b)); +} +template ::value>::type> +Expr operator<(Expr a, POD b) { + return LT::Make(Expr(a), Expr(b)); +} +template ::value>::type> +Expr operator<=(Expr a, POD b) { + return LE::Make(Expr(a), Expr(b)); +} +template ::value>::type> +Expr operator>(Expr a, POD b) { + return GT::Make(Expr(a), Expr(b)); +} +template ::value>::type> +Expr operator>=(Expr a, POD b) { + return GE::Make(Expr(a), Expr(b)); +} +template ::value>::type> +Expr operator==(Expr a, POD b) { + return EQ::Make(Expr(a), Expr(b)); +} + +//- right hand -- +template ::value>::type> +Expr operator+(POD a, Expr b) { + return Add::Make(Expr(a), Expr(b)); +} +template ::value>::type> +Expr operator-(POD a, Expr b) { + return Sub::Make(Expr(a), Expr(b)); +} +template ::value>::type> +Expr operator*(POD a, Expr b) { + return Mul::Make(Expr(a), Expr(b)); +} +template ::value>::type> +Expr operator/(POD a, Expr b) { + return Div::Make(Expr(a), Expr(b)); +} +template ::value>::type> +Expr operator%(POD a, Expr b) { + return Mod::Make(Expr(a), Expr(b)); +} +template ::value>::type> +Expr operator<(POD a, Expr b) { + return LT::Make(Expr(a), Expr(b)); +} +template ::value>::type> +Expr operator<=(POD a, Expr b) { + return LE::Make(Expr(a), Expr(b)); +} +template ::value>::type> +Expr operator>(POD a, Expr b) { + return GT::Make(Expr(a), Expr(b)); +} +template ::value>::type> +Expr operator>=(POD a, Expr b) { + return GE::Make(Expr(a), Expr(b)); +} +template ::value>::type> +Expr operator==(POD a, Expr b) { + return EQ::Make(Expr(a), Expr(b)); +} + +//-- +inline Expr operator+(Expr a, Expr b) { return Add::Make(a, b); } +inline Expr operator-(Expr a, Expr b) { return Sub::Make(a, b); } +inline Expr operator*(Expr a, Expr b) { return Mul::Make(a, b); } +inline Expr operator/(Expr a, Expr b) { return Div::Make(a, b); } +inline Expr operator%(Expr a, Expr b) { return Mod::Make(a, b); } + +inline Expr operator&&(Expr a, Expr b) { return And::Make(Expr(a), Expr(b)); } +inline Expr operator||(Expr a, Expr b) { return Or::Make(Expr(a), Expr(b)); } +inline Expr operator>=(Expr a, Expr b) { return GE::Make(Expr(a), Expr(b)); } +inline Expr operator<=(Expr a, Expr b) { return LE::Make(Expr(a), Expr(b)); } +inline Expr operator>(Expr a, Expr b) { return GT::Make(Expr(a), Expr(b)); } +inline Expr operator<(Expr a, Expr b) { return LT::Make(Expr(a), Expr(b)); } + +inline Expr operator-(Expr a) { return Minus::Make(Expr(a)); } +inline Expr operator!(Expr a) { return Not::Make(Expr(a)); } + +Expr 
operator<<(Expr a, Expr b); +Expr operator>>(Expr a, Expr b); +Expr operator^(Expr a, Expr b); +Expr operator|(Expr a, Expr b); +Expr operator&(Expr a, Expr b); +Expr operator~(Expr a); + +} // namespace ir +} // namespace cinn diff --git a/paddle/cinn/ir/ir_operators_test.cc b/paddle/cinn/ir/ir_operators_test.cc new file mode 100644 index 0000000000000..b31614308e889 --- /dev/null +++ b/paddle/cinn/ir/ir_operators_test.cc @@ -0,0 +1,28 @@ +// Copyright (c) 2021 CINN Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "cinn/ir/ir_operators.h" + +#include + +namespace cinn { +namespace ir { + +TEST(ir_operators, test) { + Expr a(1); + Expr b = a + 1; +} + +} // namespace ir +} // namespace cinn diff --git a/paddle/cinn/ir/ir_printer.cc b/paddle/cinn/ir/ir_printer.cc new file mode 100644 index 0000000000000..66604da970182 --- /dev/null +++ b/paddle/cinn/ir/ir_printer.cc @@ -0,0 +1,645 @@ +// Copyright (c) 2021 CINN Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
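
Note that the Expr operator overloads declared above build IR nodes rather than computing values; only the all-constant integer paths in ir_operators.cc fold on the spot. A quick illustration (values arbitrary):

  using cinn::ir::Expr;
  Expr a(5), b(3);
  Expr sum  = a + b;         // an unevaluated Add node, not 8
  Expr cond = sum < 16;      // an LT node, usable as a Select or IfThenElse condition
  Expr shl  = a << Expr(1);  // both operands are IntImm, so operator<< above folds this to Expr(10)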
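
Looking back at the IRMutator<T> specializations at the top of this section, they all follow one shape: cast the visited pointer to the concrete node type, then re-visit every child slot in place so that a subclass can swap out subtrees. A minimal sketch of a concrete pass built on top of it (the struct name and the folding rule are illustrative, not part of this patch):

  // Drops casts whose target type already matches the operand's type.
  struct RedundantCastFolder : public cinn::ir::IRMutator<> {
    void operator()(cinn::ir::Expr *e) { cinn::ir::IRMutator<>::Visit(e, e); }

    void Visit(const cinn::ir::Cast *op, cinn::ir::Expr *expr) override {
      auto *node = expr->As<cinn::ir::Cast>();
      cinn::ir::IRMutator<>::Visit(&node->v(), &node->v());    // rewrite children first
      if (node->v().type() == node->type()) *expr = node->v(); // then drop the no-op cast
    }
  };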
+ +#include "cinn/ir/ir_printer.h" + +#include +#include +#include +#include + +#include "cinn/ir/lowered_func.h" +#include "cinn/ir/module.h" +#include "cinn/ir/tensor.h" +#include "cinn/optim/ir_simplify.h" +#include "cinn/runtime/intrinsic.h" +#include "cinn/utils/string.h" + +namespace cinn { +namespace ir { + +using common::bfloat16; +using common::float16; + +void IrPrinter::Print(Expr e) { IRVisitor::Visit(&e); } +void IrPrinter::Print(const std::vector &exprs, const std::string &splitter) { + for (std::size_t i = 0; !exprs.empty() && i + 1 < exprs.size(); i++) { + Print(exprs[i]); + os_ << splitter; + } + if (!exprs.empty()) Print(exprs.back()); +} + +void IrPrinter::Visit(const IntImm *x) { + if (x->type().is_int(64)) { + os_ << x->value << "ll"; + } else if (x->type().is_int(32)) { + os_ << x->value; + } else if (x->type().is_int(16)) { + os_ << "(int16_t)" << x->value; + } else if (x->type().is_int(8)) { + os_ << "(int8_t)" << x->value; + } else { + LOG(FATAL) << "Not support int type: " << x->type(); + } +} +void IrPrinter::Visit(const UIntImm *x) { + if (x->type().is_uint(64)) { + os_ << x->value << "ull"; + } else if (x->type().is_uint(32)) { + os_ << x->value; + } else if (x->type().is_uint(16)) { + os_ << "(uint16_t)" << x->value; + } else if (x->type().is_uint(8)) { + os_ << "(uint8_t)" << x->value; + } else if (x->type().is_uint(1)) { + if (x->value) { + os_ << "true"; + } else { + os_ << "false"; + } + } else { + LOG(FATAL) << "Not support uint type: " << x->type(); + } +} +void IrPrinter::Visit(const FloatImm *x) { + if (x->type().is_float16()) { + if (std::isinf(x->value)) { + os_ << "cinn::common::raw_uint16_to_float16(0x7c00)"; + } else if (std::isnan(x->value)) { + os_ << "cinn::common::raw_uint16_to_float16(0x7e00)"; + } else { + os_ << "(float16)" << std::setprecision(std::numeric_limits::max_digits10) + << static_cast(x->value) << "f"; + } + } else if (x->type().is_bfloat16()) { + if (std::isinf(x->value)) { + os_ << "cinn::common::raw_uint16_to_bfloat16(0x7F80)"; + } else if (std::isnan(x->value)) { + os_ << "cinn::common::raw_uint16_to_bfloat16(0x7FC0)"; + } else { + os_ << "(bfloat16)" << std::setprecision(std::numeric_limits::max_digits10) + << static_cast(x->value) << "f"; + } + } else if (x->type().is_float(32)) { + os_ << std::setprecision(std::numeric_limits::max_digits10) << std::showpoint << x->value; + if (std::isfinite(x->value)) { + os_ << "f"; + } + } else if (x->type().is_float(64)) { + os_ << std::setprecision(std::numeric_limits::max_digits10) << std::showpoint << x->value; + } else { + LOG(FATAL) << "Not support float type: " << x->type(); + } +} +void IrPrinter::Visit(const StringImm *x) { os_ << "\"" << x->value << "\""; } +void IrPrinter::Visit(const Add *x) { PrintBinaryOp("+", x); } +void IrPrinter::Visit(const Sub *x) { PrintBinaryOp("-", x); } +void IrPrinter::Visit(const Mul *x) { PrintBinaryOp("*", x); } +void IrPrinter::Visit(const Div *x) { PrintBinaryOp("/", x); } +void IrPrinter::Visit(const Mod *x) { PrintBinaryOp("%", x); } +void IrPrinter::Visit(const EQ *x) { PrintBinaryOp("==", x); } +void IrPrinter::Visit(const NE *x) { PrintBinaryOp("!=", x); } +void IrPrinter::Visit(const LT *x) { PrintBinaryOp("<", x); } +void IrPrinter::Visit(const LE *x) { PrintBinaryOp("<=", x); } +void IrPrinter::Visit(const GT *x) { PrintBinaryOp(">", x); } +void IrPrinter::Visit(const GE *x) { PrintBinaryOp(">=", x); } +void IrPrinter::Visit(const And *x) { PrintBinaryOp("and", x); } +void IrPrinter::Visit(const Or *x) { PrintBinaryOp("or", x); } 
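
Because operator<<(std::ostream &, Expr) further down routes through IrPrinter, any Expr can be streamed directly; utils::GetStreamCnt elsewhere in this patch relies on the same overload. Roughly, given the Visit methods above:

  using cinn::ir::Expr;
  Expr x = Expr(1) + Expr(2) * Expr(3);
  LOG(INFO) << x;                  // prints (1 + (2 * 3)); int32 immediates print bare
  LOG(INFO) << (x > 3 && x < 10);  // And prints as a keyword: (((1 + (2 * 3)) > 3) and ((1 + (2 * 3)) < 10))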
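
One detail worth calling out in the FloatImm overload above: float16/bfloat16 infinities and NaNs are emitted as raw_uint16_to_float16/raw_uint16_to_bfloat16 calls on raw bit patterns (0x7c00/0x7e00 and 0x7F80/0x7FC0 respectively), since those values have no portable literal spelling in generated source, while finite values are printed with max_digits10 precision so the emitted literal round-trips to the exact same binary value.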
+void IrPrinter::Visit(const Not *x) { + os_ << "!"; + Print(x->v()); +} +void IrPrinter::Visit(const Min *x) { + os_ << "cinn_min("; + Print(x->a()); + os_ << ", "; + Print(x->b()); + os_ << ")"; +} +void IrPrinter::Visit(const Max *x) { + os_ << "cinn_max("; + Print(x->a()); + os_ << ", "; + Print(x->b()); + os_ << ")"; +} +void IrPrinter::Visit(const Minus *x) { + os_ << "-("; + Print(x->v()); + os_ << ")"; +} +void IrPrinter::Visit(const For *x) { + if (x->is_parallel()) { + os() << "parallel for ("; + } else if (x->is_unrolled()) { + os() << "unroll for ("; + } else if (x->is_vectorized()) { + int factor = x->vectorize_info().factor; + os() << "vectorize[" << factor << "] for ("; + } else if (x->is_binded()) { + auto &bind_info = x->bind_info(); + if (bind_info.valid()) { + char axis_name = 'x' + bind_info.offset; + auto for_type = bind_info.for_type; + std::string prefix = for_type == ForType::GPUBlock ? "blockIdx." : "threadIdx."; + os() << "thread_bind[" << prefix << axis_name << "] for ("; + } else { + os() << "thread_bind[invalid info] for ("; + } + } else if (x->is_serial()) { + os() << "serial for ("; + } else if (x->is_default()) { + os() << "default for ("; + } else { + os() << "for ("; + } + Print(x->loop_var); + os_ << ", "; + Print(x->min); + os_ << ", "; + Print(x->extent); + os_ << ")\n"; + + DoIndent(); + Print(x->body); +} + +void IrPrinter::Visit(const PolyFor *x) { + if (x->is_parallel()) { + os() << "parallel poly_for ("; + } else { + os() << "poly_for ("; + } + Print(x->iterator); + os_ << ", "; + Print(x->init); + os_ << ", "; + Print(x->condition); + os_ << ", "; + Print(x->inc); + os_ << ")\n"; + + DoIndent(); + Print(x->body); +} +void IrPrinter::Visit(const IfThenElse *x) { + os_ << "if ("; + Print(x->condition); + os_ << ") {\n"; + IncIndent(); + DoIndent(); + Print(x->true_case); + DecIndent(); + os() << "\n"; + DoIndent(); + os() << "}"; + + if (x->false_case.defined()) { + os_ << " else {\n"; + IncIndent(); + + DoIndent(); + Print(x->false_case); + os() << "\n"; + + DecIndent(); + DoIndent(); + os_ << "}"; + } +} +void IrPrinter::Visit(const Block *x) { + os_ << "{\n"; + + IncIndent(); + for (std::size_t i = 0; !x->stmts.empty() && i + 1 < x->stmts.size(); i++) { + DoIndent(); + Print(x->stmts[i]); + os_ << "\n"; + } + if (!x->stmts.empty()) { + DoIndent(); + Print(x->stmts.back()); + } + DecIndent(); + os_ << "\n"; + DoIndent(); + os_ << "}"; +} +void IrPrinter::Visit(const Call *x) { + os_ << x->name << "("; + if (!x->read_args.empty()) { + for (std::size_t i = 0; i + 1 < x->read_args.size(); i++) { + Print(x->read_args[i]); + os_ << ", "; + } + Print(x->read_args.back()); + } + + if (!x->write_args.empty()) { + if (!x->read_args.empty()) os() << ", "; + + for (std::size_t i = 0; i + 1 < x->write_args.size(); i++) { + Print(x->write_args[i]); + os_ << ", "; + } + Print(x->write_args.back()); + } + + os_ << ")"; +} +void IrPrinter::Visit(const Cast *x) { + os() << x->type(); + os() << "("; + os() << x->v(); + os() << ")"; +} +void IrPrinter::Visit(const _Module_ *x) {} +void IrPrinter::Visit(const _Var_ *x) { os_ << x->name; } +void IrPrinter::Visit(const Alloc *x) { + auto *buffer = x->destination.As(); + CHECK(buffer); + os_ << "alloc(" << buffer->name << ", "; + Print(x->extents); + os_ << ")"; +} +void IrPrinter::Visit(const Select *x) { + os_ << "select("; + Print(x->condition); + os_ << ", "; + Print(x->true_value); + os_ << ", "; + Print(x->false_value); + os_ << ")"; +} +void IrPrinter::Visit(const Load *x) { + if (x->is_addr_tensor()) { + auto 
*tensor = x->tensor.As(); + CHECK(tensor); + os_ << tensor->name; + } else if (x->is_addr_scalar()) { + Print(x->tensor); + } else { + CINN_NOT_IMPLEMENTED + } + + os_ << "["; + for (std::size_t i = 0; i + 1 < x->indices.size(); i++) { + Print(x->indices[i]); + os() << ", "; + } + if (!x->indices.empty()) Print(x->indices.back()); + os_ << "]"; +} +void IrPrinter::Visit(const Store *x) { + if (x->is_addr_tensor()) { + auto *tensor_node = x->tensor.As(); + CHECK(tensor_node); + os_ << tensor_node->name; + } else if (x->is_addr_scalar()) { + Print(x->tensor); + } else { + CINN_NOT_IMPLEMENTED + } + + os_ << "["; + for (std::size_t i = 0; i + 1 < x->indices.size(); i++) { + Print(x->indices[i]); + os() << ", "; + } + if (!x->indices.empty()) Print(x->indices.back()); + os_ << "] = "; + Print(x->value); +} +void IrPrinter::Visit(const Free *x) { + auto *buffer = x->destination.As(); + CHECK(buffer); + os_ << "free(" << buffer->name << ")"; +} + +void IrPrinter::DoIndent() { os_ << std::string(indent_, ' '); } +void IrPrinter::IncIndent() { indent_ += indent_unit; } +void IrPrinter::DecIndent() { indent_ -= indent_unit; } + +void IrPrinter::Visit(const _Buffer_ *x) { + std::vector dim_names; + std::transform(x->shape.begin(), x->shape.end(), std::back_inserter(dim_names), [&](const Expr &x) { + return utils::GetStreamCnt(x); + }); + + os_ << "_Buffer_<" << x->type() << ": " << utils::Join(dim_names, ",") << ">(" << x->name << ")"; +} +void IrPrinter::Visit(const _Tensor_ *x) { + os_ << "Tensor("; + os() << x->name << ", "; + os() << "["; + if (!x->shape.empty()) { + for (std::size_t i = 0; i + 1 < x->shape.size(); i++) { + Print(x->shape[i]); + os() << ","; + } + Print(x->shape.back()); + } + os_ << "])"; +} +void IrPrinter::Visit(const _LoweredFunc_ *f) { + os_ << "function " << f->name << " "; + + std::vector arg_names; + for (auto &arg : f->args) { + arg_names.push_back(arg.name()); + } + os_ << "(" << utils::Join(arg_names, ", ") << ")\n"; + + Print(f->body); +} +void IrPrinter::Visit(const Let *f) { + CHECK(f->type().valid()); + os() << f->type() << " "; + Print(f->symbol); + if (f->body.defined()) { + os() << " = "; + Print(f->body); + } +} + +void IrPrinter::Visit(const Reduce *f) { + os() << "Reduce("; + switch (f->reduce_type) { + case Reduce::ReduceType::kSum: + os() << "sum"; + break; + case Reduce::ReduceType::kSub: + os() << "sub"; + break; + case Reduce::ReduceType::kDiv: + os() << "Div"; + break; + case Reduce::ReduceType::kMul: + os() << "Mul"; + break; + case Reduce::ReduceType::kMax: + os() << "Max"; + break; + case Reduce::ReduceType::kMin: + os() << "Min"; + break; + case Reduce::ReduceType::kAll: + os() << "&&"; + break; + case Reduce::ReduceType::kAny: + os() << "||"; + break; + } + os() << ", "; + Print(f->body); + os() << ","; + Print(f->init); + os() << ")"; +} + +void IrPrinter::Visit(const Ramp *x) { + os() << "Ramp("; + Print(x->base); + os() << ","; + Print(x->stride); + os() << ","; + os() << x->lanes; + os() << ")"; +} + +void IrPrinter::Visit(const Broadcast *x) { + os() << "Broadcast("; + Print(x->value); + os() << ","; + os() << x->lanes; + os() << ")"; +} + +void IrPrinter::Visit(const FracOp *x) { + os() << "("; + Print(x->a()); + os() << " / "; + Print(x->b()); + os() << ")"; +} + +void IrPrinter::Visit(const Product *x) { + os() << "("; + for (std::size_t i = 0; i + 1 < x->operands().size(); i++) { + Print(x->operand(i)); + os() << " * "; + } + if (!x->operands().empty()) Print(x->operands().back()); + os() << ")"; +} + +void IrPrinter::Visit(const Sum *x) 
{ + os() << "("; + for (std::size_t i = 0; i + 1 < x->operands().size(); i++) { + Print(x->operand(i)); + os() << " + "; + } + if (!x->operands().empty()) Print(x->operands().back()); + os() << ")"; +} + +void IrPrinter::Visit(const PrimitiveNode *x) { + os() << x->name << "("; + std::vector args_repr; + for (auto &args : x->arguments) { + std::vector arg_repr; + for (auto &arg : args) { + arg_repr.push_back(utils::GetStreamCnt(arg)); + } + args_repr.push_back(utils::Join(arg_repr, ",")); + } + + os() << utils::Join(args_repr, ","); + os() << ")"; +} + +void IrPrinter::Visit(const _BufferRange_ *x) { + auto *buffer = x->buffer.As(); + CHECK(buffer); + os() << buffer->name << "["; + for (std::size_t i = 0; i < x->ranges.size(); i++) { + if (i) os() << ", "; + auto &range = x->ranges[i]; + os() << range->name << "("; + if (range->lower_bound.defined()) { + os() << range->lower_bound << ":"; + } else { + os() << "undefined:"; + } + + if (range->upper_bound.defined()) { + os() << range->upper_bound; + } else { + os() << "undefined"; + } + os() << ")"; + } + os() << "]"; +} + +void IrPrinter::Visit(const ScheduleBlock *x) {} + +void IrPrinter::Visit(const ScheduleBlockRealize *x) { + auto *schedule_block = x->schedule_block.As(); + os() << "ScheduleBlock(" << schedule_block->name << ")\n"; + DoIndent(); + os() << "{\n"; + // print block vars and bindings + auto iter_vars = schedule_block->iter_vars; + auto iter_values = x->iter_values; + CHECK_EQ(iter_vars.size(), iter_values.size()); + IncIndent(); + if (!iter_vars.empty()) DoIndent(); + for (std::size_t i = 0; i < iter_vars.size(); i++) { + if (i) os() << ", "; + os() << iter_vars[i]->name; + } + if (!iter_vars.empty()) os() << " = axis.bind("; + for (std::size_t i = 0; i < iter_values.size(); i++) { + if (i) os() << ", "; + os() << iter_values[i]; + } + if (!iter_vars.empty()) os() << ")\n"; + // print block body + if (!schedule_block->read_buffers.empty()) { + DoIndent(); + os() << "read_buffers("; + auto &read_buffers = schedule_block->read_buffers; + for (std::size_t i = 0; i < read_buffers.size(); i++) { + if (i) os() << ", "; + Print(read_buffers[i]); + } + os() << ")\n"; + } + if (!schedule_block->write_buffers.empty()) { + DoIndent(); + os() << "write_buffers("; + auto &write_buffers = schedule_block->write_buffers; + for (std::size_t i = 0; i < write_buffers.size(); i++) { + if (i) os() << ", "; + Print(write_buffers[i]); + } + os() << ")\n"; + } + if (!schedule_block->attrs.empty()) { + DoIndent(); + os() << "attrs("; + bool comma = false; + for (auto &&kv : schedule_block->attrs) { + if (comma) os() << ", "; + os() << kv.first << ":"; + absl::visit([this](auto &&arg) { this->os() << arg; }, kv.second); + comma = true; + } + os() << ")\n"; + } + DoIndent(); + Print(schedule_block->body); + os() << "\n"; + DecIndent(); + DoIndent(); + os() << "}"; +} + +void IrPrinter::Visit(const IntrinsicOp *x) { + switch (x->getKind()) { +#define __(op__) \ + case IntrinsicKind::k##op__: \ + Visit(llvm::dyn_cast(x)); \ + break; + + INTRINSIC_KIND_FOR_EACH(__) +#undef __ + } +} +void IrPrinter::Visit(const intrinsics::BufferGetDataHandle *x) { + os() << runtime::intrinsic::buffer_get_data_handle; + Print(x->buffer); + os() << ")"; +} +void IrPrinter::Visit(const intrinsics::BufferGetDataConstHandle *x) { + os() << runtime::intrinsic::buffer_get_data_const_handle; + Print(x->buffer); + os() << ")"; +} +void IrPrinter::Visit(const intrinsics::PodValueToX *x) { + os() << "pod_value_to_"; + os() << x->GetOutputType(0); + os() << "("; + 
Print(x->pod_value_ptr); + os() << ")"; +} +void IrPrinter::Visit(const intrinsics::BufferCreate *x) { + os() << runtime::intrinsic::buffer_create; + os() << "()"; +} +void IrPrinter::Visit(const intrinsics::GetAddr *x) { + os() << "get_addr("; + Print(x->data); + os() << ")"; +} +void IrPrinter::Visit(const intrinsics::ArgsConstruct *x) { + os() << runtime::intrinsic::args_construct_repr; + os() << "("; + Print(std::vector(x->args.begin(), x->args.end())); + os() << ")"; +} + +void IrPrinter::Visit(const intrinsics::BuiltinIntrin *x) { + os_ << runtime::intrinsic::builtin_intrin_repr << "_"; + os_ << x->name << "("; + if (!x->args.empty()) { + for (std::size_t i = 0; i + 1 < x->args.size(); i++) { + Print(x->args[i]); + os_ << ", "; + } + Print(x->args.back()); + } + + os_ << ")"; +} + +std::ostream &operator<<(std::ostream &os, Expr a) { + std::stringstream ss; + IrPrinter printer(ss); + printer.Print(a); + os << ss.str(); + return os; +} + +std::ostream &operator<<(std::ostream &os, const std::vector &a) { + std::stringstream ss; + IrPrinter printer(ss); + printer.Print(a); + os << ss.str(); + return os; +} + +std::ostream &operator<<(std::ostream &os, const ir::Module &m) { + os << "Module " << m->name << " {\n\n"; + for (auto &fn : m->functions) { + os << fn << '\n'; + } + os << "\n\n}"; + return os; +} + +} // namespace ir +} // namespace cinn diff --git a/paddle/cinn/ir/ir_printer.h b/paddle/cinn/ir/ir_printer.h new file mode 100644 index 0000000000000..7eafbcf97172e --- /dev/null +++ b/paddle/cinn/ir/ir_printer.h @@ -0,0 +1,80 @@ +// Copyright (c) 2021 CINN Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include + +#include "cinn/ir/buffer.h" +#include "cinn/ir/ir.h" +#include "cinn/ir/ir_visitor.h" + +namespace cinn { + +namespace lang { +class LoweredFunc; +} // namespace lang + +namespace ir { +class Module; + +struct IrPrinter : public IRVisitor { + explicit IrPrinter(std::ostream &os) : os_(os) {} + + //! Emit an expression on the output stream. + void Print(Expr e); + //! Emit a expression list with , splitted. + void Print(const std::vector &exprs, const std::string &splitter = ", "); + //! Emit a binary operator + template + void PrintBinaryOp(const std::string &op, const BinaryOpNode *x); + + //! Prefix the current line with `indent_` spaces. + void DoIndent(); + //! Increase the indent size. + void IncIndent(); + //! Decrease the indent size. 
+ void DecIndent(); + + std::ostream &os() { return os_; } + +#define __(op__) void Visit(const op__ *x) override; + NODETY_FORALL(__) +#undef __ + +#define __(op__) virtual void Visit(const intrinsics::op__ *x); + INTRINSIC_KIND_FOR_EACH(__) +#undef __ + + private: + std::ostream &os_; + uint16_t indent_{}; + const int indent_unit{2}; +}; + +std::ostream &operator<<(std::ostream &os, Expr a); +std::ostream &operator<<(std::ostream &os, const std::vector &a); +std::ostream &operator<<(std::ostream &os, const Module &m); + +template +void IrPrinter::PrintBinaryOp(const std::string &op, const BinaryOpNode *x) { + os_ << "("; + Print(x->a()); + os_ << " " + op + " "; + Print(x->b()); + os_ << ")"; +} + +} // namespace ir +} // namespace cinn diff --git a/paddle/cinn/ir/ir_printer_test.cc b/paddle/cinn/ir/ir_printer_test.cc new file mode 100644 index 0000000000000..1f9edca6ded05 --- /dev/null +++ b/paddle/cinn/ir/ir_printer_test.cc @@ -0,0 +1,23 @@ +// Copyright (c) 2021 CINN Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "cinn/ir/ir_printer.h" + +#include + +#include + +namespace cinn { +namespace ir {} // namespace ir +} // namespace cinn diff --git a/paddle/cinn/ir/ir_schedule.cc b/paddle/cinn/ir/ir_schedule.cc new file mode 100644 index 0000000000000..eb2d934e0f646 --- /dev/null +++ b/paddle/cinn/ir/ir_schedule.cc @@ -0,0 +1,2310 @@ +// Copyright (c) 2021 CINN Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "cinn/ir/ir_schedule.h" + +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "cinn/common/cas.h" +#include "cinn/common/common.h" +#include "cinn/common/ir_util.h" +#include "cinn/ir/collect_ir_nodes.h" +#include "cinn/ir/ir.h" +#include "cinn/ir/ir_mutator.h" +#include "cinn/ir/ir_operators.h" +#include "cinn/ir/ir_printer.h" +#include "cinn/ir/ir_schedule_util.h" +#include "cinn/ir/ir_visitor.h" +#include "cinn/lang/compute.h" +#include "cinn/optim/ir_copy.h" +#include "cinn/optim/ir_simplify.h" +#include "cinn/optim/replace_var_with_expr.h" +#include "cinn/utils/string.h" + +namespace cinn { +namespace ir { + +/** + * A struct helps to implement Schedule primitives. 
+ */ +class ScheduleImpl { + public: + ScheduleImpl() = default; + explicit ScheduleImpl(const ModuleExpr& module_expr, bool debug_flag = false) + : module_expr_(module_expr), debug_flag_(debug_flag) {} + explicit ScheduleImpl(ModuleExpr&& module_expr) : module_expr_(std::move(module_expr)) {} + + //! Set the debug flag. + void SetDebugFlag(bool debug_flag) { debug_flag_ = debug_flag; } + + //! Get the ModuleExpr stored in ScheduleImpl. + const ModuleExpr& GetModule() const { return module_expr_; } + + void MergeExprs(); + + void SetExprs(const std::vector& exprs) { module_expr_.SetExprs(exprs); } + + bool HasBlock(const std::string& block_name) const; + + std::vector GetLoops(const Expr& block) const; + std::vector GetLoops(const std::string& block_name) const; + std::vector GetAllBlocks() const; + std::vector GetChildBlocks(const Expr& expr) const; + Expr GetBlock(const std::string& block_name) const; + std::vector Split(const Expr& loop, const std::vector& factors); + std::vector SamplePerfectTile(utils::LinearRandomEngine::StateType* rand_seed, + const Expr& loop, + int n, + int max_innermost_factor); + Expr Fuse(const std::vector& loops); + Expr Fuse(const std::string& block_name, const std::vector& loops_index); + Expr Fuse(const Expr& block, const std::vector& loops_index); + void ComputeAt(const Expr& block, const Expr& loop, bool keep_unit_loops); + void SimpleComputeAt(const Expr& block, const Expr& loop); + void ReverseComputeAt(const Expr& block, const Expr& loop, bool keep_unit_loops); + Expr GetRootBlock(const Expr& expr) const; + Expr CacheRead(const Expr& block, int read_buffer_index, const std::string& memory_type); + Expr CacheWrite(const Expr& block, int write_buffer_index, const std::string& memory_type); + void SyncThreads(const Expr& ir_node, bool after_node = true); + void SetBuffer(Expr& block, const std::string& memory_type, bool fixed = false); + Expr Reorder(const std::vector& loops); + Expr Reorder(const std::string& block_name, const std::vector& loops_index); + Expr Reorder(const Expr& block, const std::vector& loops_index); + DeviceAPI GetDeviceAPI() const; + void MutateForType(const Expr& loop, ForType for_type, int factor = -1); + void Parallel(const Expr& loop); + void Vectorize(const Expr& loop, int factor); + void Unroll(const Expr& loop); + void ComputeInline(const Expr& schedule_block); + void ReverseComputeInline(const Expr& schedule_block); + void Bind(const Expr& loop, const std::string& thread_axis); + Expr Rfactor(const Expr& rf_loop, int rf_axis); + Expr AddUnitLoop(const Expr& block) const; + void Annotate(const Expr& block, const std::string& key, const attr_t& value); + void Unannotate(Expr& block, const std::string& key); + void FlattenLoops(const std::vector& loops, const bool force_flat = false); + void CopyTransformAndLoopInfo(const Expr& block, const Expr& block_target); + void CopyTransformAndLoopInfo(const std::string& block_name, const std::string& block_target_name); + Expr SampleCategorical(utils::LinearRandomEngine::StateType* rand_seed, + const std::vector& candidates, + const std::vector& probs); + + private: + void Replace(const Expr& src_sref, const Expr& tgt_stmt); + + ModuleExpr module_expr_; + bool debug_flag_{false}; +}; + +std::vector ScheduleImpl::Split(const Expr& loop, const std::vector& factors) { + CHECK(loop.As()) << "Expr param of Split must be For node! Please check."; + auto* for_node = loop.As(); + CHECK(common::is_zero(for_node->min)) << "The For node must start with 0! 
Please check."; + CHECK(for_node->extent.is_constant()) << "The For node's extent must be constant! Please check."; + int tot_extent = for_node->extent.get_constant(); + + VLOG(3) << "Try Split loop from (" << for_node->loop_var->name << ", 0, " << tot_extent << ") to (" + << cinn::utils::Join(factors, ", ") << ") at loop:\n" + << loop; + + auto processed_factors = ValidateFactors(factors, tot_extent); + int prod_size = std::accumulate(processed_factors.begin(), processed_factors.end(), 1, std::multiplies()); + std::vector new_loop_vars; + Expr substitute_value(0); + for (int i = 0; i < processed_factors.size(); ++i) { + Var temp_var(common::UniqName(for_node->loop_var->name)); + substitute_value = Expr(temp_var) + substitute_value * Expr(processed_factors[i]); + new_loop_vars.push_back(temp_var); + } + substitute_value = common::AutoSimplify(substitute_value); + Expr new_node = optim::IRCopy(for_node->body); + ReplaceExpr(&new_node, {for_node->loop_var}, {substitute_value}); + std::vector splited_loops; + splited_loops.resize(processed_factors.size()); + if (tot_extent < prod_size) { + new_node = IfThenElse::Make(LT::Make(substitute_value, for_node->extent), new_node); + } + for (int i = processed_factors.size() - 1; i >= 0; i--) { + if (!new_node.As()) new_node = Block::Make({new_node}); + new_node = For::Make( + new_loop_vars[i], Expr(0), Expr(processed_factors[i]), for_node->for_type(), for_node->device_api, new_node); + splited_loops[i] = new_node; + } + + this->Replace(loop, new_node); + VLOG(3) << "After Split, ir is:\n" << splited_loops.at(0); + return splited_loops; +} + +Expr ScheduleImpl::Fuse(const std::vector& loops) { + VLOG(3) << "Tring to fuse:\n" << cinn::utils::Join(loops, "\n"); + std::vector for_nodes; + std::vector loop_vars; + CHECK(!loops.empty()) << "The loops param of Fuse should not be empty! Please check."; + + for (const Expr& it_loop : loops) { + CHECK(it_loop.As()) << "Expr param of Fuse must be For node! Please check."; + if (!for_nodes.empty()) { + CHECK(for_nodes.back()->body.As()) << "The body of for node is not Block!"; + CHECK_EQ(for_nodes.back()->body.As()->stmts.size(), 1U) << "The Block'size of for node is not 1!"; + CHECK_EQ(for_nodes.back()->body.As()->stmts[0], it_loop) + << "The For nodes in loops param of Fuse must be adjacent! 
Please check."; + } + for_nodes.push_back(it_loop.As()); + loop_vars.push_back(it_loop.As()->loop_var); + } + std::string suffix; + suffix = for_nodes[0]->loop_var->name; + int loops_number = for_nodes.size(); + for (int i = 1; i < loops_number; ++i) { + suffix += "_" + for_nodes[i]->loop_var->name; + } + suffix += "_fused"; + Var fused_var(suffix); + std::vector substitute_value; + substitute_value.resize(loops_number); + Expr fused_expr(fused_var); + for (int i = loops_number - 1; i > 0; i--) { + substitute_value[i] = Mod::Make(fused_expr, for_nodes[i]->extent); + fused_expr = Div::Make(fused_expr, for_nodes[i]->extent); + } + substitute_value[0] = fused_expr; + + Expr fused_body = optim::IRCopy(for_nodes.back()->body); + ReplaceExpr(&fused_body, loop_vars, substitute_value); + optim::Simplify(&fused_body); + Expr fused_extent(1); + for (int i = 0; i < loops_number; ++i) { + fused_extent = fused_extent * for_nodes[i]->extent; + } + fused_extent = common::AutoSimplify(fused_extent); + + if (!fused_body.As()) fused_body = Block::Make({fused_body}); + Expr new_stmt = + For::Make(fused_var, Expr(0), fused_extent, for_nodes[0]->for_type(), for_nodes[0]->device_api, fused_body); + this->Replace(loops[0], new_stmt); + + VLOG(3) << "After fuse, ir is:\n" << new_stmt; + return new_stmt; +} + +Expr ScheduleImpl::Fuse(const std::string& block_name, const std::vector& loops_index) { + std::vector all_loops = this->GetLoops(block_name); + std::vector loops_expr; + loops_expr.reserve(loops_index.size()); + for (int i = 0; i < loops_index.size(); ++i) { + if (i > 0) CHECK_EQ(loops_index[i - 1] + 1, loops_index[i]) << "Loops index in Fuse shoule be continuous!"; + } + for (int i : loops_index) { + CHECK_LT(i, (int)all_loops.size()) << "The loop index in Fuse should be less than total loop's number."; + CHECK_GE(i, 0) << "The loop index in Fuse should be >= 0."; + loops_expr.emplace_back(all_loops[i]); + } + return this->Fuse(loops_expr); +} + +Expr ScheduleImpl::Fuse(const Expr& block, const std::vector& loops_index) { + std::vector all_loops = this->GetLoops(block); + std::vector loops_expr; + loops_expr.reserve(loops_index.size()); + for (int i = 0; i < loops_index.size(); ++i) { + if (i > 0) CHECK_EQ(loops_index[i - 1] + 1, loops_index[i]) << "Loops index in Fuse shoule be continuous!"; + } + for (int i : loops_index) { + CHECK_LT(i, (int)all_loops.size()) << "The loop index in Fuse should be less than total loop's number."; + CHECK_GE(i, 0) << "The loop index in Fuse should be >= 0."; + loops_expr.emplace_back(all_loops[i]); + } + return this->Fuse(loops_expr); +} + +void ScheduleImpl::MutateForType(const Expr& loop, ForType for_type, int factor) { + auto* for_node = loop.As(); + CHECK(for_node) << "loop param must be For node! 
Please check."; + CHECK(for_node->is_serial()) << "loop is not serial, current forloop type is " + << static_cast(for_node->for_type()) << ", and it cannot become " + << static_cast(for_type); + auto loop_copy = optim::IRCopy(loop); + auto* new_for_node = loop_copy.As(); + CHECK(new_for_node); + new_for_node->set_for_type(for_type); + if (new_for_node->is_vectorized()) { + VectorizeInfo vec_info(0, factor); + new_for_node->set_vectorize_info(vec_info); + } else if (new_for_node->is_binded()) { + BindInfo bind_info(for_type, factor, DeviceAPI::GPU); + new_for_node->set_bind_info(bind_info); + } + this->Replace(loop, loop_copy); +} + +void ScheduleImpl::Parallel(const Expr& loop) { MutateForType(loop, ForType::Parallel); } + +void ScheduleImpl::Vectorize(const Expr& loop, int factor) { + CHECK_GT(factor, 0) << "vectorize factor should be more than 0"; + MutateForType(loop, ForType::Vectorized, factor); +} + +void ScheduleImpl::Unroll(const Expr& loop) { MutateForType(loop, ForType::Unrolled); } + +void ScheduleImpl::Bind(const Expr& loop, const std::string& thread_axis) { + static std::set thread_axes = { + "blockIdx.x", "blockIdx.y", "blockIdx.z", "threadIdx.x", "threadIdx.y", "threadIdx.z"}; + CHECK(thread_axes.count(thread_axis)) << "thread_axis " << thread_axis << " is not supported"; + int offset = thread_axis.back() - 'x'; + if (thread_axis[0] == 'b') { + MutateForType(loop, ForType::GPUBlock, offset); + } else { + MutateForType(loop, ForType::GPUThread, offset); + } +} + +// The struct used to mutate new rfactor forloop and its' schedule block. +struct RfMutator : public ir::IRMutator<> { + public: + RfMutator(const Expr& rf_loop, const int& rf_axis) : rf_loop_(rf_loop), rf_axis_(rf_axis) {} + void operator()(Expr* expr) { + auto* rf_for = rf_loop_.As(); + CHECK(rf_for); + old_rf_loop_var_ = rf_for->loop_var; + new_rf_loop_var_ = Var("rf_" + old_rf_loop_var_->name); + IRMutator::Visit(expr, expr); + } + + Tensor GetNewRfTensor() { return new_rf_tensor_; } + + void Visit(const ScheduleBlockRealize* op, Expr* expr) override { + // modify iter_vars and iter_values + auto* node = expr->As(); + CHECK(node); + auto* schedule_block = node->schedule_block.As(); + CHECK(schedule_block); + old_output_name_ = schedule_block->name; + find_tensor_ = false; + auto& block_vars = schedule_block->iter_vars; + auto& iter_values = node->iter_values; + CHECK(old_rf_loop_var_.defined()); + CHECK(new_rf_loop_var_.defined()); + CHECK_EQ(iter_values.size(), block_vars.size()); + int rf_index = -1; + for (int i = 0; i < iter_values.size(); ++i) { + // substitute the old rfactor loop var to new rfactor loop var + if (ContainVar({iter_values[i]}, old_rf_loop_var_->name)) { + CHECK_EQ(rf_index, -1) << "only one block var can bind the rfactor loop var"; + CHECK(iter_values[i].As<_Var_>()) << "rfactor loop var not support composite bindings"; + rf_index = i; + optim::ReplaceVarWithExpr(&iter_values[i], old_rf_loop_var_, new_rf_loop_var_); + new_rf_itervar_ = block_vars[i]; + } + } + // create new rfactor block var if not exist + if (rf_index == -1) { + new_rf_itervar_ = Var(cinn::UniqName("i" + std::to_string(block_vars.size()))); + iter_values.push_back(new_rf_loop_var_); + block_vars.push_back(new_rf_itervar_); + } + IRMutator::Visit(&node->schedule_block, &node->schedule_block); + CHECK(find_tensor_) << "not find the store tensor with the schedule block name " << old_output_name_; + schedule_block->name = "rf_" + old_output_name_; + } + + void Visit(const Load* op, Expr* expr) override { + // insert the new 
rfactor indice if not exist + auto* node = expr->As(); + CHECK(node); + auto* tensor = node->tensor.As<_Tensor_>(); + CHECK(tensor); + if (tensor->name == "rf_" + old_output_name_) { + int size = node->indices.size(); + CHECK_LE(rf_axis_, size) << "rf_axis should not be greater than indice size " << size; + CHECK(new_rf_itervar_.defined()); + CHECK(!ContainVar(node->indices, new_rf_itervar_->name)) + << "original output tensor " << old_output_name_ << " should not have the new rfactor index " + << new_rf_itervar_; + node->indices.insert(node->indices.begin() + rf_axis_, new_rf_itervar_); + } + } + + void Visit(const Store* op, Expr* expr) override { + // insert the new rfactor indice if not exist + auto* node = expr->As(); + CHECK(node); + auto* tensor = node->tensor.As<_Tensor_>(); + CHECK(tensor); + if (tensor->name == old_output_name_) { + find_tensor_ = true; + tensor->name = "rf_" + tensor->name; + int size = node->indices.size(); + CHECK_LE(rf_axis_, size) << "rf_axis should not be greater than indice size " << size; + CHECK(!ContainVar(node->indices, new_rf_itervar_->name)) + << "original output tensor " << old_output_name_ << " should not have the new rfactor index " + << new_rf_itervar_; + node->indices.insert(node->indices.begin() + rf_axis_, new_rf_itervar_); + auto* rf_for = rf_loop_.As(); + CHECK(rf_for); + CHECK(is_zero(rf_for->min)) << "rfactor loop's min should be zero"; + auto extent = common::AutoSimplify(rf_for->extent); + auto& shape = tensor->shape; + auto& domain = tensor->domain; + CHECK_LE(rf_axis_, shape.size()) << "rf_axis should not be greater than tensor shape size " << shape.size(); + CHECK_LE(rf_axis_, domain.size()) << "rf_axis should not be greater than tensor domain size " << domain.size(); + shape.insert(shape.begin() + rf_axis_, extent); + domain.insert(domain.begin() + rf_axis_, extent); + if (tensor->buffer.defined()) { + if (tensor->buffer->name.find_first_of("rf") == std::string::npos) { + tensor->buffer->name = "rf_" + tensor->buffer->name; + tensor->buffer->shape = shape; + } + } + new_rf_tensor_ = Tensor(tensor); + } + IRMutator::Visit(&node->value, &node->value); + } + + void Visit(const For* op, Expr* expr) override { + auto* node = expr->As(); + CHECK(node); + depth++; + auto* rf_for = rf_loop_.As(); + CHECK(rf_for); + // erase the original rfactor forloop + if (node->loop_var->name == old_rf_loop_var_->name) { + auto body = node->body.As(); + if (body && body->stmts.size() == 1) { + *expr = body->stmts[0]; + } else { + *expr = node->body; + } + IRMutator::Visit(expr, expr); + } else { + IRMutator::Visit(&node->body, &node->body); + } + if (rf_axis_ == 0 && depth == rf_axis_) { + // insert new rfactor forloop in the rf_axis as serial loop + *expr = For::Make( + new_rf_loop_var_, rf_for->min, rf_for->extent, ForType::Serial, rf_for->device_api, Block::Make({*expr})); + } else if (depth == rf_axis_ - 1) { + // insert new rfactor forloop in the rf_axis as serial loop + node->body = Block::Make( + {For::Make(new_rf_loop_var_, rf_for->min, rf_for->extent, ForType::Serial, rf_for->device_api, node->body)}); + } + depth--; + } + + private: + Expr rf_loop_; + Var old_rf_loop_var_; + Var new_rf_loop_var_; + int rf_axis_; + int depth = -1; + bool find_tensor_ = false; + std::string old_output_name_; + Var new_rf_itervar_; + Tensor new_rf_tensor_; +}; + +// The struct used to mutate final write-back forloop and schedule block. 
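
As an aside before the rest of the rfactor machinery, here is a usage sketch of the primitives defined so far, assuming the public ir::IRSchedule wrapper that forwards to this ScheduleImpl (the block name and extents are hypothetical):

  cinn::ir::IRSchedule sch(mod_expr);  // mod_expr: a lowered ModuleExpr containing block "C"
  auto loops   = sch.GetLoops("C");              // outermost-first loops of block C
  auto factors = sch.Split(loops[0], {-1, 32});  // i -> (i_0, i_1), inner extent 32, -1 inferred
  sch.Bind(factors[0], "blockIdx.x");            // ForType::GPUBlock, offset 0 ('x' - 'x')
  sch.Bind(factors[1], "threadIdx.x");           // ForType::GPUThread, offset 0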
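
As for rfactor itself: RfMutator above builds the partial-reduction half, and FinalMutator below rewrites the original reduction into a write-back over the new tensor. Schematically (tensor and loop names hypothetical), rfactor on loop k with rf_axis = 0 turns

  for (i, 0, M)
    for (k, 0, K)
      B[i] = B[i] + A[i, k]

into a partial-sum stage plus a write-back stage:

  for (rf_k, 0, K)                                 // new serial loop inserted at rf_axis
    for (i, 0, M)
      rf_B[rf_k, i] = rf_B[rf_k, i] + A[i, rf_k]   // RfMutator: partial sums
  for (i, 0, M)
    for (k, 0, K)
      B[i] = B[i] + rf_B[k, i]                     // FinalMutator: write-back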
+struct FinalMutator : public ir::IRMutator<> { + public: + FinalMutator(const Expr& rf_loop, const int& rf_axis, const Tensor& new_rf_tensor) + : rf_loop_(rf_loop), rf_axis_(rf_axis), new_rf_tensor_(new_rf_tensor) {} + void operator()(Expr* expr) { + auto* rf_for = rf_loop_.As(); + CHECK(rf_for); + old_rf_loop_var_ = rf_for->loop_var; + IRMutator::Visit(expr, expr); + } + + void Visit(const ScheduleBlockRealize* op, Expr* expr) override { + auto* node = expr->As(); + CHECK(node); + auto* schedule_block = node->schedule_block.As(); + CHECK(schedule_block); + auto& iter_vars = schedule_block->iter_vars; + auto& iter_values = node->iter_values; + output_name_ = schedule_block->name; + visit_init_block_ = output_name_.rfind("_init") != std::string::npos; + if (!visit_init_block_) { + for (int i = 0; i < iter_values.size(); ++i) { + if (ContainVar({iter_values[i]}, old_rf_loop_var_->name)) { + // record the rfactor loop var's block var + CHECK(iter_values[i].As<_Var_>()) << "not support complex reduce bindings: " << iter_values[i]; + old_rf_iter_var_ = iter_vars[i]; + break; + } + } + } + IRMutator::Visit(&node->schedule_block, &node->schedule_block); + // modify iter_vars and iter_values, erase other reduce block vars and values + for (auto it = iter_values.begin(); it != iter_values.end(); ++it) { + for (auto erase_var : erase_reduce_loopvars_) { + if (ContainVar({*it}, erase_var)) { + CHECK((*it).As<_Var_>()) << "not support complex reduce bindings: " << *it; + iter_vars.erase(it - iter_values.begin() + iter_vars.begin()); + iter_values.erase(it); + --it; + break; + } + } + } + } + + // currently only support reduce_sum, reduce_mul, reduce_min and reduce_max + void Visit(const Add* op, Expr* expr) override { + auto* node = expr->As(); + CHECK(node); + auto& oper_b = node->b(); + oper_b = Load::Make(new_rf_tensor_, new_rf_indice_); + } + + void Visit(const Mul* op, Expr* expr) override { + auto* node = expr->As(); + CHECK(node); + auto& oper_b = node->b(); + oper_b = Load::Make(new_rf_tensor_, new_rf_indice_); + } + + void Visit(const Min* op, Expr* expr) override { + auto* node = expr->As(); + CHECK(node); + auto& oper_b = node->b(); + oper_b = Load::Make(new_rf_tensor_, new_rf_indice_); + } + + void Visit(const Max* op, Expr* expr) override { + auto* node = expr->As(); + CHECK(node); + auto& oper_b = node->b(); + oper_b = Load::Make(new_rf_tensor_, new_rf_indice_); + } + + void Visit(const Store* op, Expr* expr) override { + // insert the new rfactor indice if not exist + auto* node = expr->As(); + CHECK(node); + auto* tensor = node->tensor.As<_Tensor_>(); + CHECK(tensor); + CHECK_EQ(tensor->name, output_name_) << "store name should be same with the schedule block name"; + if (!visit_init_block_) { + new_rf_indice_ = node->indices; + CHECK_LE(rf_axis_, new_rf_indice_.size()) + << "rf_axis_ should not be greater than tensor indice size " << new_rf_indice_.size(); + CHECK(old_rf_iter_var_.defined()); + new_rf_indice_.insert(new_rf_indice_.begin() + rf_axis_, old_rf_iter_var_); + IRMutator::Visit(&node->value, &node->value); + } + } + + void Visit(const For* op, Expr* expr) override { + auto* node = expr->As(); + CHECK(node); + auto* rf_for = rf_loop_.As(); + // erase the reduce forloops after the init block except the rfactor loop + if (visit_init_block_ && node->loop_var->name != old_rf_loop_var_->name) { + erase_reduce_loopvars_.insert(node->loop_var->name); + auto body = node->body.As(); + if (body && body->stmts.size() == 1) { + *expr = body->stmts[0]; + } else { + *expr = node->body; + 
} + IRMutator::Visit(expr, expr); + } else { + IRMutator::Visit(&node->body, &node->body); + } + } + + private: + Expr rf_loop_; + int rf_axis_; + Var old_rf_loop_var_; + Var old_rf_iter_var_; + std::string output_name_; + // collect reduce loop vars except rfactor loop var + std::set erase_reduce_loopvars_; + bool visit_init_block_ = false; + Tensor new_rf_tensor_; + std::vector new_rf_indice_; +}; + +// The struct used to create all stmts after rfactor transformation. +struct RfCreater : public ir::IRMutator<> { + public: + RfCreater(const Expr& root, const Expr& rf_loop, const int& rf_axis) + : root_(root), rf_loop_(rf_loop), rf_axis_(rf_axis) {} + void operator()(Expr* expr) { IRMutator::Visit(expr, expr); } + + Expr CreateRfAllStmts() { + auto root_realize = root_.As(); + CHECK(root_realize); + auto root_block = root_realize->schedule_block.As(); + CHECK(root_block); + Expr root_loop = optim::IRCopy(root_block->body); + if (auto block = root_loop.As()) { + CHECK_EQ(block->stmts.size(), 1U) << "rfactor root should only have one block stmt"; + root_loop = block->stmts[0]; + } + auto* root_for = root_loop.As(); + CHECK(root_for); + auto rf_for = rf_loop_.As(); + CHECK(rf_for); + // create new rfactor forloops + Expr new_rf_forloop = optim::IRCopy(root_loop); + RfMutator rf_mutator(rf_loop_, rf_axis_); + rf_mutator(&new_rf_forloop); + VLOG(3) << "After RfMutator, new rf_forloop is\n" << new_rf_forloop; + auto new_rf_tensor = rf_mutator.GetNewRfTensor(); + // create final write-back forloops + Expr final_forloop = optim::IRCopy(root_loop); + FinalMutator final_mutator(rf_loop_, rf_axis_, new_rf_tensor); + final_mutator(&final_forloop); + VLOG(3) << "After FinalMuator, final write-back forloop is\n" << final_forloop; + // combine the new created rfactor forloops with the final write-back forloops and replace + root_block->body = Block::Make({new_rf_forloop, final_forloop}); + return new_rf_tensor; + } + + Expr root_; + Expr rf_loop_; + int rf_axis_; +}; + +Expr ScheduleImpl::Rfactor(const Expr& rf_loop, int rf_axis) { + CHECKRfactorValidation(rf_loop, rf_axis); + // get root ScheduleBlockRealize + Expr root = GetRootBlock(rf_loop); + // create all stmts after rfactor transformation + RfCreater rf_create(root, rf_loop, rf_axis); + // return new created rfactor tensor + return rf_create.CreateRfAllStmts(); +} + +struct CacheReadRewriter : public ir::IRMutator<> { + public: + static Expr Rewrite(const Expr& root, CacheBlockInfo* info) { + CacheReadRewriter rewriter(root, info); + Expr new_root = optim::IRCopy(root); + rewriter(&new_root); + return new_root; + } + + void operator()(Expr* expr) { IRMutator::Visit(expr, expr); } + + private: + explicit CacheReadRewriter(const Expr& root, CacheBlockInfo* info) : root_(root), info_(info) {} + + void Visit(const ir::Block* expr, Expr* op) override { + if (*op == info_->loc_block) { + IRMutator::Visit(expr, op); + op->As()->stmts.insert(op->As()->stmts.begin() + info_->loc_pos, info_->cache_block); + } else { + IRMutator::Visit(expr, op); + } + } + + void Visit(const ir::Load* expr, Expr* op) override { + if (expr->tensor == Expr(info_->read_tensor)) { + IRMutator::Visit(expr, op); + op->As()->tensor = Expr(info_->write_tensor); + } else { + IRMutator::Visit(expr, op); + } + } + + private: + /*! \brief The parent scope of the insertion */ + const Expr& root_; + /*! 
\brief The info for inserting cache stage */ + CacheBlockInfo* info_; +}; + +struct CacheWriteRewriter : public ir::IRMutator<> { + public: + static Expr Rewrite(const Expr& root, CacheBlockInfo* info) { + CacheWriteRewriter rewriter(root, info); + Expr new_root = optim::IRCopy(root); + rewriter.mutate_cache_block = true; + rewriter(&info->cache_block); + rewriter.mutate_cache_block = false; + rewriter(&new_root); + auto find_tensor = ir::CollectIRNodesWithoutTensor( + new_root, + [&](const Expr* x) { return x->As() && (x->As()->tensor == Expr(info->read_tensor)); }, + true); + if (!find_tensor.empty()) { + auto find_store = ir::CollectIRNodesWithoutTensor((*find_tensor.begin()), [&](const Expr* x) { + return x->As() && (x->As()->tensor == Expr(info->write_tensor)); + }); + for (auto load_ir : find_store) { + load_ir.As()->tensor = Expr(info->read_tensor); + } + } + return new_root; + } + + void operator()(Expr* expr) { IRMutator::Visit(expr, expr); } + + private: + explicit CacheWriteRewriter(const Expr& root, CacheBlockInfo* info) : root_(root), info_(info) {} + + void Visit(const ir::Block* expr, Expr* op) override { + if (*op == info_->loc_block) { + IRMutator::Visit(expr, op); + op->As()->stmts.insert(op->As()->stmts.begin() + info_->loc_pos, info_->cache_block); + } else { + IRMutator::Visit(expr, op); + } + } + + void Visit(const ir::ScheduleBlock* expr, Expr* op) override { + if (op->As()->name == info_->write_tensor->name) { + op->As()->name = info_->read_tensor->name; + } else if (op->As()->name == info_->read_tensor->name) { + op->As()->name = info_->write_tensor->name; + } + IRMutator::Visit(expr, op); + } + + void Visit(const ir::Load* expr, Expr* op) override { + IRMutator::Visit(expr, op); + if (op->As()->tensor == Expr(info_->write_tensor) && mutate_cache_block) { + op->As()->tensor = Expr(info_->read_tensor); + } else if (op->As()->tensor == Expr(info_->read_tensor) && mutate_cache_block) { + op->As()->tensor = Expr(info_->write_tensor); + } + } + + void Visit(const ir::Store* expr, Expr* op) override { + IRMutator::Visit(expr, op); + if (op->As()->tensor == Expr(info_->write_tensor)) { + op->As()->tensor = Expr(info_->read_tensor); + } else if (op->As()->tensor == Expr(info_->read_tensor) && mutate_cache_block) { + op->As()->tensor = Expr(info_->write_tensor); + } + } + + private: + /*! \brief The parent scope of the insertion */ + const Expr& root_; + /*! \brief The info for inserting cache stage */ + CacheBlockInfo* info_; + /*! \brief Are we mutating the cache tensor's block */ + bool mutate_cache_block{true}; +}; + +//! Visit all ScheduleBlock and change its body to ir::Block if it is not. 
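
Complementing the two rewriters above, a sketch of how the cache primitives are meant to be driven, extending the hypothetical sch from the earlier sketch (buffer indices and memory types illustrative):

  auto blk  = sch.GetBlock("C");
  auto rbuf = sch.CacheRead(blk, 0, "shared");  // stage C's first read operand through shared memory
  auto wbuf = sch.CacheWrite(blk, 0, "local");  // write C into a local buffer, then copy back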
+struct ChangeBodyToBlock : public ir::IRMutator<> { + public: + static void Change(Expr* expr) { + ChangeBodyToBlock mutator; + mutator(expr); + } + + void operator()(Expr* expr) { IRMutator::Visit(expr, expr); } + + private: + void Visit(const ir::ScheduleBlock* expr, Expr* op) override { + if (!op->As()->body.As()) { + op->As()->body = Block::Make({op->As()->body}); + } + IRMutator::Visit(expr, op); + } +}; + +DeviceAPI ScheduleImpl::GetDeviceAPI() const { + auto exprs = this->GetModule().GetExprs(); + auto find_for_nodes = ir::CollectIRNodesWithoutTensor( + exprs.front(), [&](const Expr* x) { return x->As(); }, true); + CHECK(!find_for_nodes.empty()); + return (*find_for_nodes.begin()).As()->device_api; +} + +Expr ScheduleImpl::CacheRead(const Expr& block, int read_tensor_index, const std::string& memory_type) { + CHECK(block.As()); + auto root = GetRootBlock(block); + ChangeBodyToBlock::Change(&root); + Expr read_expr = GetNthAccessExpr(block, read_tensor_index, false); + CHECK(read_expr.As()); + auto tensor_indices = read_expr.As()->indices; + CacheBlockInfo info; + info.read_tensor = read_expr.As()->tensor.as_tensor_ref(); + info.write_tensor = MakeCacheTensor(info.read_tensor, memory_type); + info.alloc = info.write_tensor; + + auto read_ranges = CalculateTensorRegions(block, tensor_indices, info.read_tensor, root); + auto new_block = MakeCacheBlock(read_ranges, &info, memory_type, this->GetDeviceAPI()); + FindInsertionPoint(root, &info, false); + auto new_root = CacheReadRewriter::Rewrite(root, &info); + this->Replace(root.As()->schedule_block.As()->body, + new_root.As()->schedule_block.As()->body); + return new_block; +} + +Expr ScheduleImpl::CacheWrite(const Expr& block, int write_buffer_index, const std::string& memory_type) { + CHECK(block.As()); + auto root = GetRootBlock(block); + ChangeBodyToBlock::Change(&root); + Expr write_expr = GetNthAccessExpr(block, write_buffer_index, true); + CHECK(write_expr.As()); + Tensor write_tensor = write_expr.As()->tensor.as_tensor_ref(); + auto tensor_indices = write_expr.As()->indices; + CacheBlockInfo info; + info.read_tensor = MakeCacheTensor(write_tensor, memory_type); + info.write_tensor = write_tensor; + info.alloc = info.read_tensor; + auto write_ranges = CalculateTensorRegions(block, tensor_indices, info.write_tensor, root); + auto new_block = MakeCacheBlock(write_ranges, &info, memory_type, this->GetDeviceAPI()); + FindInsertionPoint(root, &info, true); + + auto new_root = CacheWriteRewriter::Rewrite(root, &info); + this->Replace(root.As()->schedule_block.As()->body, + new_root.As()->schedule_block.As()->body); + + auto find_cache_block = ir::CollectIRNodesWithoutTensor( + root, + [&](const Expr* x) { + return x->As() && !x->As()->iter_values.empty() && + GetTensor(*x)->name == info.read_tensor->name; + }, + true); + + CHECK(info.write_tensor->buffer.defined()); + + // Replace buffer + auto all_tensors = ir::CollectIRNodesWithoutTensor( + root, [&](const Expr* x) { return x->as_tensor() && x->as_tensor()->buffer.defined(); }); + + for (auto i : all_tensors) { + if (i.as_tensor()->name != info.write_tensor->name && i.as_tensor()->buffer.defined() && + i.as_tensor()->buffer->name == info.write_tensor->buffer->name) { + i.as_tensor()->Bind(info.read_tensor->buffer); + } + } + + CHECK_EQ(find_cache_block.size(), 1U); + + return *find_cache_block.begin(); +} + +struct InsertExpr : public ir::IRMutator<> { + public: + static void Insert(const Expr& ir_node, const Expr& insert_node, bool after_node, Expr* expr) { + InsertExpr 
mutator(ir_node, insert_node, after_node); + mutator(expr); + } + + void operator()(Expr* expr) { IRMutator::Visit(expr, expr); } + + private: + explicit InsertExpr(const Expr& ir_node, const Expr& insert_node, bool after_node) + : ir_node_(ir_node), insert_node_(insert_node), after_node_(after_node) {} + + void Visit(const ir::Block* expr, Expr* op) override { + for (int i = 0; i < expr->stmts.size(); i++) { + if (expr->stmts[i] == ir_node_) { + if (after_node_) { + op->As()->stmts.insert(op->As()->stmts.begin() + i + 1, insert_node_); + } else { + op->As()->stmts.insert(op->As()->stmts.begin() + i, insert_node_); + } + return; + } + } + IRMutator::Visit(expr, op); + } + + void Visit(const ir::For* expr, Expr* op) override { + if (expr->body == ir_node_) { + if (after_node_) + op->As()->body = ir::Block::Make({op->As()->body, insert_node_}); + else + op->As()->body = ir::Block::Make({insert_node_, op->As()->body}); + return; + } + IRMutator::Visit(expr, op); + } + + private: + const Expr& ir_node_; + const Expr& insert_node_; + bool after_node_; +}; + +void ScheduleImpl::SyncThreads(const Expr& ir_node, bool after_node) { + CHECK(ir_node.As() || ir_node.As()); + auto root = GetRootBlock(ir_node); + ChangeBodyToBlock::Change(&root); + Expr sync_threads = runtime::IntrinsicCall(Void(), "__syncthreads", {}); + InsertExpr::Insert(ir_node, sync_threads, after_node, &root); + return; +} + +/** + * Replace a For node to another For node. + * @param src_sref The For node to be changed. + * @param tgt_stmt The For node we want. + */ +void ScheduleImpl::Replace(const Expr& src_sref, const Expr& tgt_stmt) { + CHECK(src_sref.As() || src_sref.As() || src_sref.As()); + CHECK(tgt_stmt.As() || tgt_stmt.As() || tgt_stmt.As()); + if (src_sref == tgt_stmt) { + return; + } + struct ForLoopMutator : public ir::IRMutator<> { + ForLoopMutator(const Expr& source, const Expr& target) : source_(source), target_(target) {} + + void operator()(Expr* expr) { ir::IRMutator<>::Visit(expr, expr); } + + void Visit(const ir::For* op, Expr* expr) override { + if (*expr == source_) { + *expr = target_; + return; + } + ir::IRMutator<>::Visit(op, expr); + } + + void Visit(const ir::ScheduleBlockRealize* op, Expr* expr) override { + if (*expr == source_) { + *expr = target_; + return; + } + ir::IRMutator<>::Visit(op, expr); + } + + void Visit(const ir::Block* op, Expr* expr) override { + if (*expr == source_) { + *expr = target_; + return; + } + ir::IRMutator<>::Visit(op, expr); + } + + const Expr& source_; + const Expr& target_; + }; + auto exprs = module_expr_.GetExprs(); + ForLoopMutator mutator(src_sref, tgt_stmt); + for (auto& i : exprs) { + mutator(&i); + } +} + +Expr ScheduleImpl::Reorder(const std::vector& loops) { + if (loops.size() <= 1) { + return Expr{nullptr}; + } + VLOG(4) << "Before Reorder, ir is:\n" << loops[0]; + + std::set loop_set = CollectLoopsToSet(loops); + auto boundary = GetBoundaryOfReorderRange(loop_set); + Expr top = boundary.first; + Expr bottom = boundary.second; + std::vector chain = GetLoopsInRange(top, bottom); + std::vector if_nodes = GetIfThenElseInRange(top, bottom); + Expr new_loop = ConstructNewLoopChain(chain, loops, loop_set, if_nodes); + this->Replace(top, new_loop); + + VLOG(4) << "After Reorder, ir is:\n" << new_loop; + return new_loop; +} + +Expr ScheduleImpl::Reorder(const std::string& block_name, const std::vector& loops_index) { + std::vector all_loops = this->GetLoops(block_name); + std::vector loops_expr; + loops_expr.reserve(loops_index.size()); + for (int i : loops_index) { + 
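+    // Illustrative: Reorder("B", {2, 0, 1}) permutes the loops around block "B"
+    // from (i, j, k) to (k, i, j); each index below must be a valid loop position.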
CHECK_LT(i, (int)all_loops.size()) << "The loop index in Reorder should be less than total loop's number."; + CHECK_GE(i, 0) << "The loop index in Reorder should be >= 0."; + loops_expr.emplace_back(all_loops[i]); + } + return this->Reorder(loops_expr); +} + +Expr ScheduleImpl::Reorder(const Expr& block, const std::vector& loops_index) { + std::vector all_loops = this->GetLoops(block); + std::vector loops_expr; + loops_expr.reserve(loops_index.size()); + for (int i : loops_index) { + CHECK_LT(i, (int)all_loops.size()) << "The loop index in Reorder should be less than total loop's number."; + CHECK_GE(i, 0) << "The loop index in Reorder should be >= 0."; + loops_expr.emplace_back(all_loops[i]); + } + return this->Reorder(loops_expr); +} + +Expr ScheduleImpl::GetRootBlock(const Expr& expr) const { + auto exprs = this->GetModule().GetExprs(); + for (auto& it_expr : exprs) { + auto find_expr = ir::CollectIRNodesWithoutTensor( + it_expr, [&](const Expr* x) { return x->node_type() == expr.node_type() && *x == expr; }, true); + if (!find_expr.empty()) { + CHECK(it_expr.As()); + CHECK_EQ(it_expr.As()->stmts.size(), 1U); + CHECK(it_expr.As()->stmts[0].As()); + return it_expr.As()->stmts[0]; + } + } + LOG(FATAL) << "Didn't find expr \n" << expr << "in ScheduleImpl:\n" << exprs[0]; +} + +// The struct used to reconstruct the new For node to replace the old For node. +struct LoopReconstructor : public ir::IRMutator<> { + public: + explicit LoopReconstructor(const Expr& root, const Expr& block, const Expr& loop) + : root_(root), block_(block), loop_(loop) {} + + void operator()(Expr* expr) { IRMutator::Visit(expr, expr); } + + /* \param inserted_pos The position index of the new_loop_ body `stmts` to be inserted: + * - `index = -1` means inserted into the tail + * - otherwise, it should be a index between [0, stmts size) + */ + std::string MakeNewLoop(const std::vector& iter_ranges, bool keep_unit_loops, int inserted_pos = -1) { + int n_iters = iter_ranges.size(); + std::vector loop_vars; + std::vector loop_extents; + std::vector iter_values; + loop_vars.reserve(n_iters); + loop_extents.reserve(n_iters); + iter_values.reserve(n_iters); + std::vector new_var_names; + for (int i = 0; i < n_iters; ++i) { + const auto& range = iter_ranges[i]; + if (keep_unit_loops || range.extent != Expr(1)) { + std::string var_name = common::UniqName("ax" + std::to_string(loop_vars.size())); + new_var_names.push_back(var_name); + Var var(var_name, Int(32)); + loop_vars.push_back(var); + loop_extents.push_back(range.extent); + iter_values.push_back(common::AutoSimplify(range.min) + var); + } else { + iter_values.push_back(common::AutoSimplify(range.min)); + } + } + auto schedule_block_node = block_.As()->schedule_block; + new_block_ = ScheduleBlockRealize::Make(std::move(iter_values), std::move(schedule_block_node)); + Expr loop_body = new_block_; + for (int i = static_cast(loop_vars.size()) - 1; i >= 0; --i) { + auto loop_var = loop_vars[i]; + auto loop_extent = loop_extents[i]; + if (!loop_body.As()) loop_body = Block::Make({loop_body}); + loop_body = For::Make( + loop_var, Expr(0), loop_extent, ForType::Serial, loop_.As()->device_api, std::move(loop_body)); + } + new_loop_ = optim::IRCopy(loop_); + + // Replace the copied Tensor object with the original Tensor object, + // to ensure that the same Tensor in a AST is the same object. 
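+    // Schematic example: after IRCopy, a Store to tensor "B" inside new_loop_
+    // points at a fresh _Tensor_ copy; the code below re-binds it, by name, to
+    // the "B" object collected from the original loop_.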
+ std::unordered_map tensors_map; + ir::CollectIRNodesWithoutTensor(loop_, [&tensors_map](const Expr* x) { + if (x->as_tensor()) { + tensors_map.insert({x->as_tensor()->name, *x}); + return true; + } + return false; + }); + auto find_store = ir::CollectIRNodesWithoutTensor(new_loop_, [](const Expr* x) { return x->As(); }); + for (auto store : find_store) { + store.As()->tensor = tensors_map.at(store.As()->tensor.as_tensor()->name); + } + auto find_load = ir::CollectIRNodesWithoutTensor(new_loop_, [](const Expr* x) { return x->As(); }); + for (auto load : find_load) { + load.As()->tensor = tensors_map.at(load.As()->tensor.as_tensor()->name); + } + + InsertBlock(new_loop_, loop_body, inserted_pos); + return utils::Join(new_var_names, ","); + } + + private: + public: + /*! \brief The root block */ + Expr root_; + /*! \brief The given block to be moved */ + Expr block_; + /*! \brief The given loop the block and its loop nest to be put under */ + Expr loop_; + /*! \brief The new loop to replace the original loop */ + Expr new_loop_{nullptr}; + /*! \brief The new block realize to the moved block */ + Expr new_block_{nullptr}; + /*! \brief The plan to remove the given block by replacing this loop/block in the AST */ + Expr source_expr{nullptr}; + /*! \brief The plan to remove the given block by replacing to this loop/block in the AST */ + Expr target_expr{nullptr}; +}; + +struct FixLocalBufferSize : public ir::IRMutator<> { + public: + FixLocalBufferSize(const std::string& tensor_name) : tensor_name_(tensor_name) {} + + void operator()(Expr* expr) { IRMutator::Visit(expr, expr); } + + private: + void Visit(const ir::Store* expr, Expr* op) override { + if (op->As()->tensor.As<_Tensor_>()->name == tensor_name_) { + op->As()->tensor.As<_Tensor_>()->shape = {Expr(1)}; + op->As()->tensor.As<_Tensor_>()->domain = {Expr(1)}; + op->As()->tensor.As<_Tensor_>()->buffer->shape = {Expr(1)}; + op->As()->indices = {Expr(0)}; + } + IRMutator::Visit(expr, op); + } + + void Visit(const ir::Load* expr, Expr* op) override { + if (op->As()->tensor.As<_Tensor_>()->name == tensor_name_) { + op->As()->tensor.As<_Tensor_>()->shape = {Expr(1)}; + op->As()->tensor.As<_Tensor_>()->domain = {Expr(1)}; + op->As()->tensor.As<_Tensor_>()->buffer->shape = {Expr(1)}; + op->As()->indices = {Expr(0)}; + } + IRMutator::Visit(expr, op); + } + std::string tensor_name_; +}; + +void ScheduleImpl::SetBuffer(Expr& block, const std::string& memory_type, bool fixed) { + CHECK(block.As()); + auto find_tensor = ir::CollectIRNodesWithoutTensor( + block, [&](const Expr* x) { return x->As(); }, true); + CHECK_EQ(find_tensor.size(), 1U) << "One block should only have one Store node!(except for root block)"; + auto& tensor = (*find_tensor.begin()).As()->tensor; + tensor.as_tensor_ref()->WithBuffer(memory_type, "_" + tensor.as_tensor_ref()->name + "_temp_buffer"); + + auto exprs = this->GetModule().GetExprs(); + for (auto& it_expr : exprs) { + auto find_tensor = ir::CollectIRNodesWithoutTensor(it_expr, [&](const Expr* x) { + return x->as_tensor() && (x->as_tensor()->name == tensor.as_tensor_ref()->name || + x->as_tensor()->name == tensor.as_tensor_ref()->name + "__reduce_init"); + }); + for (auto& t : find_tensor) { + CHECK(t.as_tensor()); + t.as_tensor_ref()->Bind(tensor.as_tensor_ref()->buffer); + } + } + + // if buffer type == "local" + if (memory_type == "local" && fixed) { + FixLocalBufferSize mutator(block.As()->schedule_block.As()->name); + auto root = GetRootBlock(block); + mutator(&root); + } +} + +void ScheduleImpl::MergeExprs() { + auto 
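+  // Sketch of the merge (schematic): exprs {E0, E1} with root bodies b0 and b1
+  // become a single expr whose root ScheduleBlock body is Block {b0, b1}.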
exprs = this->GetModule().GetExprs(); + if (exprs.size() == 1U) return; + CHECK(exprs[0].As()); + CHECK_EQ(exprs[0].As()->stmts.size(), 1U); + CHECK(exprs[0].As()->stmts[0].As()); + CHECK(exprs[0].As()->stmts[0].As()->schedule_block.As()); + std::vector merged_block; + merged_block.push_back( + exprs[0].As()->stmts[0].As()->schedule_block.As()->body); + VLOG(3) << "Before merging, exprs[0] is : " << exprs[0]; + for (int i = 1; i < exprs.size(); ++i) { + auto root_block = ir::CollectIRNodesWithoutTensor( + exprs[i], + [&](const Expr* x) { + return x->As() && x->As()->iter_values.empty(); + }, + true); + CHECK_EQ(root_block.size(), 1U); + for (auto& it_block : root_block) { + auto& block_body = it_block.As()->schedule_block.As()->body; + merged_block.push_back(block_body); + } + } + for (auto& block : merged_block) { + VLOG(3) << "in merged_block, it has " << block; + } + auto merged_expr = ir::Block::Make(merged_block); + exprs[0].As()->stmts[0].As()->schedule_block.As()->body = + merged_expr; + VLOG(3) << "After merging, exprs[0] is : " << exprs[0]; + exprs.erase(exprs.begin() + 1, exprs.end()); + this->SetExprs(exprs); +} + +void ScheduleImpl::ComputeAt(const Expr& block, const Expr& loop, bool keep_unit_loops) { + CHECK(block.As()); + CHECK(loop.As()); + Expr root = this->GetRootBlock(block); + + VLOG(3) << "Begin ComputeAt of loop:\n" << loop << "\nat block:\n" << root; + + auto producers = GetProducers(block, root); + auto consumers = GetConsumers(block, root); + CheckComputeAtValidation(block, loop, root); + LoopReconstructor reconstructor(root, block, loop); + LeafBlockRemovalPlan remove_plan(block, &reconstructor.source_expr, &reconstructor.target_expr); + remove_plan(&root); + auto iter_ranges = CalculateRequiredRegions(block, loop, root, consumers); + std::string new_var_names = reconstructor.MakeNewLoop(iter_ranges, keep_unit_loops, 0); + auto sch_block_expr = block.As()->schedule_block; + sch_block_expr.As()->attrs.emplace(ir::attr::compute_at_extra_var, new_var_names); + this->Replace(reconstructor.source_expr, reconstructor.target_expr); + this->Replace(reconstructor.loop_, reconstructor.new_loop_); + + VLOG(3) << "After SimpleComputeAt, ir is:\n" << reconstructor.new_loop_; +} + +void ScheduleImpl::SimpleComputeAt(const Expr& block, const Expr& loop) { + CHECK(block.As()); + CHECK(loop.As()); + std::vector block_loops = this->GetLoops(block); + Expr root = this->GetRootBlock(block); + auto loops = GetLoopsOfExpr(loop, root); + + VLOG(3) << "Begin SimpleComputeAt of loop:\n" << loop << "\nat block:\n" << root; + + auto this_loop = loop; + auto block_name = GetTensor(block)->name; + auto this_block = block; + if (GetLoopExtent(loops[0]) == 1 && GetLoopExtent(block_loops[0]) != 1) { + this->Split(block_loops[0], {1, -1}); + this_block = this->GetBlock(block_name); + } else if (GetLoopExtent(loops[0]) != 1 && GetLoopExtent(block_loops[0]) == 1) { + auto splited = this->Split(loops[0], {1, -1}); + this_loop = splited[1]; + } + + block_loops = this->GetLoops(this_block); + root = this->GetRootBlock(this_block); + loops = GetLoopsOfExpr(this_loop, root); + + CHECK_LE(loops.size(), block_loops.size()); + + std::vector replaced_var; + std::vector substitute_expr; + for (int i = 0; i < loops.size(); ++i) { + CHECK_EQ(GetLoopExtent(loops[i]), GetLoopExtent(block_loops[i])); + if (block_loops[i].As()->bind_info().valid() && !loops[i].As()->bind_info().valid()) { + loops[i].As()->set_bind_info(block_loops[i].As()->bind_info()); + } + replaced_var.push_back(block_loops[i].As()->loop_var); 
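+    // e.g. the block's own loop var (say i0_b) will later be substituted by the
+    // target loop's var i0 when the copied body is moved under `loop`.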
+ substitute_expr.push_back(Expr(loops[i].As()->loop_var)); + } + + Expr result = + loops.size() < block_loops.size() ? optim::IRCopy(block_loops[loops.size()]) : optim::IRCopy(this_block); + Expr new_loop = optim::IRCopy(this_loop); + + // Get the body of block_loop under the same loops + auto body = block_loops.at(loops.size() - 1).As()->body; + // collect if + auto if_checker = [](const Expr* x) { return x->As(); }; + auto if_set = ir::CollectIRNodesWithoutTensor(body, if_checker); + for (auto if_expr : if_set) { + auto checker = [block_name](const Expr* x) { + return x->As() && + x->As()->schedule_block.As()->name == block_name; + }; + if (ir::CollectIRNodesWithoutTensor(if_expr, checker, true).size() > 0) { + result = IfThenElse::Make(if_expr.As()->condition, result); + break; + } + } + + ReplaceExpr(&result, replaced_var, substitute_expr); + // When there are two identical IfThenElse + if (new_loop.As() && new_loop.As()->body.As() && + new_loop.As()->body.As()->stmts[0].As()) { + auto if_then_else = new_loop.As()->body.As()->stmts[0]; + if (result.As() && + if_then_else.As()->condition == result.As()->condition) { + new_loop.As()->body.As()->stmts[0].As()->true_case = + ir::Block::Make({result.As()->true_case, + new_loop.As()->body.As()->stmts[0].As()->true_case}); + } else { + std::vector::iterator pos = new_loop.As()->body.As()->stmts.begin(); + new_loop.As()->body.As()->stmts.insert(pos, result); + } + } else { + new_loop.As()->body = ir::Block::Make({result, new_loop.As()->body}); + } + + Expr source_expr{nullptr}; + Expr target_expr{nullptr}; + + LeafBlockRemovalPlan remove_plan( + result.As() ? block_loops[loops.size()] : this_block, &source_expr, &target_expr); + remove_plan(&root); + + this->Replace(source_expr, target_expr); + this->Replace(this_loop, new_loop); + + VLOG(3) << "After SimpleComputeAt, ir is:\n" << new_loop; +} + +void ScheduleImpl::ReverseComputeAt(const Expr& block, const Expr& loop, bool keep_unit_loops) { + CHECK(block.As()); + CHECK(loop.As()); + Expr root = this->GetRootBlock(block); + auto producers = GetProducers(block, root); + auto consumers = GetConsumers(block, root); + CheckComputeAtValidation(block, loop, root); + LoopReconstructor reconstructor(root, block, loop); + LeafBlockRemovalPlan remove_plan(block, &reconstructor.source_expr, &reconstructor.target_expr); + remove_plan(&root); + auto iter_ranges = CalculateRequiredRegions(block, loop, root, producers, false); + std::string new_var_names = reconstructor.MakeNewLoop(iter_ranges, keep_unit_loops, -1); + auto sch_block_expr = block.As()->schedule_block; + sch_block_expr.As()->attrs.emplace(ir::attr::reverse_compute_at_extra_var, new_var_names); + this->Replace(reconstructor.source_expr, reconstructor.target_expr); + this->Replace(reconstructor.loop_, reconstructor.new_loop_); + return; +} + +void BaseInliner::operator()(Expr* expr) { + IRMutator::Visit(&tgt_stmt, &tgt_stmt); + IRMutator::Visit(expr, expr); +} + +void BaseInliner::Visit(const ir::Block* expr, Expr* op) { + if (*op == src_stmt) { + *op = tgt_stmt; + return; + } + IRMutator::Visit(expr, op); +} + +bool BaseInliner::UpdateAndCheckIndexVars(const std::vector& indices, int expected_ndim) { + int n = indices.size(); + if (n != expected_ndim) { + return false; + } + std::vector result; + result.reserve(n); + for (auto& i : indices) { + if (i.as_var()) { + result.push_back(i.as_var_ref()); + } else { + return false; + } + } + int n_distinct = std::set(result.begin(), result.end()).size(); + if (n != n_distinct) { + return false; + } + 
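+  // Illustrative: indices (i, j, k) pass; (i, i) fails the distinctness check
+  // above, and (i + 1, j) fails earlier because i + 1 is not a plain Var.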
if (idx_vars_.empty()) { + idx_vars_ = std::move(result); + } else { + if (idx_vars_.size() != result.size()) return false; + for (int i = 0; i < result.size(); ++i) { + if (Expr(idx_vars_[i]) != Expr(result[i])) return false; + } + } + return true; +} + +void BaseInliner::SetIndexSubstitution(const std::vector& indices) { + CHECK_EQ(indices.size(), idx_vars_.size()); + int n = idx_vars_.size(); + idx_sub_var_.reserve(n); + idx_sub_expr_.reserve(n); + for (int i = 0; i < n; ++i) { + idx_sub_var_.push_back(idx_vars_[i]); + idx_sub_expr_.push_back(indices[i]); + } +} + +bool ComputeInliner::BodyPatternAllowInline() { + if (!inlined_store_.defined()) { + return false; + } + CHECK(inlined_store_.As()); + auto find_vars = ir::CollectIRNodesWithoutTensor(inlined_store_, [&](const Expr* x) { return x->as_var(); }); + std::set vars_set; + for (auto& i : find_vars) vars_set.insert(i.as_var_ref()); + int n_vars = vars_set.size(); + if (!UpdateAndCheckIndexVars(inlined_store_.As()->indices, n_vars)) { + return false; + } + return true; +} + +void ComputeInliner::Visit(const ir::Load* expr, Expr* op) { + if ((expr->tensor).as_tensor_ref()->name == inlined_tensor_->name) { + *op = ReplaceInlinedTensor(op); + return; + } + IRMutator::Visit(expr, op); +} + +//! Replace the 'Load' node on the tensor to 'Load' node of its producers. +Expr ComputeInliner::ReplaceInlinedTensor(Expr* load) { + CHECK(load->As()); + SetIndexSubstitution(load->As()->indices); + Expr value_copy = optim::IRCopy(inlined_store_.As()->value); + ReplaceExpr(&value_copy, idx_sub_var_, idx_sub_expr_); + return value_copy; +} + +void ScheduleImpl::ComputeInline(const Expr& schedule_block) { + CHECK(schedule_block.As()); + Expr root = this->GetRootBlock(schedule_block); + Expr store = CheckComputeInlineValidationAndGetStore(schedule_block, root); + ComputeInliner inliner(store.As()->tensor.as_tensor_ref(), store); + CHECK(inliner.BodyPatternAllowInline()); + // Create a plan that removes the block to be inlined + LeafBlockRemovalPlan remove_plan(schedule_block, &inliner.src_stmt, &inliner.tgt_stmt); + remove_plan(&root); + inliner(&root); + return; +} + +bool ComputeInlineChecker::Check() { + Expr root = ir_schedule_.GetRootBlock(block_); + store_ = CheckComputeInlineValidationAndGetStore(block_, root); + IRMutator::Visit(&root, &root); + return !should_skip_; +} + +void ComputeInlineChecker::BuildDataDependency() { + ir_schedule_.SetBuffer(block_, "shared", true); + auto loops = ir_schedule_.GetLoops(block_); + ir_schedule_.SyncThreads(loops.back(), true); +} + +bool ReverseComputeInliner::BodyPatternAllowInline() { + if (!inlined_store_.defined()) { + return false; + } + if (!inlined_load_.defined()) { + return false; + } + if (!target_store_.defined()) { + return false; + } + CHECK(inlined_store_.As()); + CHECK(inlined_load_.As()); + CHECK(target_store_.As()); + auto find_vars = ir::CollectIRNodesWithoutTensor(inlined_store_, [&](const Expr* x) { return x->as_var(); }); + std::set vars_set; + for (auto& i : find_vars) vars_set.insert(i.as_var_ref()); + int n_vars = vars_set.size(); + if (!UpdateAndCheckIndexVars(inlined_store_.As()->indices, n_vars)) { + return false; + } + return true; +} + +void ReverseComputeInliner::Visit(const ir::Load* expr, Expr* op) { + if ((expr->tensor).as_tensor_ref()->name == inlined_tensor_->name) { + *op = inlined_store_.As()->value; + return; + } + IRMutator::Visit(expr, op); +} + +void ReverseComputeInliner::Visit(const ir::Store* expr, Expr* op) { + if ((expr->tensor).as_tensor_ref()->name == 
inlined_tensor_->name) { + *op = ReplaceTargetTensor(op); + return; + } + IRMutator::Visit(expr, op); +} + +//! Replace the 'Load' node on the tensor to 'Load' node of its producers. +Expr ReverseComputeInliner::ReplaceInlinedTensor(Expr* load) { + CHECK(load->As()); + SetIndexSubstitution(load->As()->indices); + Expr value_copy = optim::IRCopy(inlined_store_.As()->value); + return value_copy; +} + +Expr ReverseComputeInliner::ReplaceTargetTensor(Expr* store) { + auto indices = inlined_load_.As()->indices; + CHECK_EQ(indices.size(), idx_vars_.size()); + size_t n = idx_vars_.size(); + idx_sub_var_.reserve(n); + idx_sub_expr_.reserve(n); + for (int i = 0; i < n; ++i) { + idx_sub_var_.emplace_back(indices[i].as_var_ref()); + idx_sub_expr_.emplace_back(idx_vars_[i]); + } + + Expr value_copy = optim::IRCopy(target_store_); + ReplaceExpr(&value_copy, idx_sub_var_, idx_sub_expr_); + return value_copy; +} + +void ScheduleImpl::ReverseComputeInline(const Expr& schedule_block) { + Expr root = this->GetRootBlock(schedule_block); + auto exprs = CheckReverseComputeInlineValidationAndGetExprs(schedule_block, root); + Expr inlined_load = std::get<0>(exprs); + Expr inlined_store = std::get<1>(exprs); + Expr target_store = std::get<2>(exprs); + ReverseComputeInliner inliner( + inlined_store.As()->tensor.as_tensor_ref(), inlined_store, inlined_load, target_store); + CHECK(inliner.BodyPatternAllowInline()); + // Create a plan that removes the block to be inlined + LeafBlockRemovalPlan remove_plan(schedule_block, &inliner.src_stmt, &inliner.tgt_stmt); + remove_plan(&root); + inliner(&root); + inliner(&root); +} + +struct FindBlockParent : public ir::IRMutator<> { + public: + FindBlockParent(const std::string& block_name) : block_name_(block_name) {} + + void operator()(Expr* expr) { IRMutator::Visit(expr, expr); } + + private: + void Visit(const ir::Block* expr, Expr* op) override { + if (target_) return; + for (auto& stmt : expr->stmts) { + if (stmt.As()) { + if (stmt.As()->schedule_block.As()->name == block_name_) { + target_ = op; + return; + } + } + } + IRMutator::Visit(expr, op); + } + + void Visit(const ir::For* expr, Expr* op) override { + if (target_) return; + if (expr->body.As()) { + if (expr->body.As()->schedule_block.As()->name == block_name_) { + target_ = op; + return; + } + } + IRMutator::Visit(expr, op); + } + + void Visit(const ir::ScheduleBlock* expr, Expr* op) override { + if (target_) return; + if (expr->body.As()) { + if (expr->body.As()->schedule_block.As()->name == block_name_) { + target_ = op; + return; + } + } + IRMutator::Visit(expr, op); + } + + std::string block_name_; + + public: + ir::Expr* target_{nullptr}; +}; + +Expr ScheduleImpl::AddUnitLoop(const Expr& block) const { + auto exprs = module_expr_.GetExprs(); + CHECK(block.As()); + CHECK(block.As()->schedule_block.As()); + std::string block_name = block.As()->schedule_block.As()->name; + + FindBlockParent visitor(block_name); + for (auto expr : exprs) { + visitor(&expr); + if (visitor.target_) { + break; + } + } + + CHECK(visitor.target_) << ", block name : " << block_name << "\n" << exprs; + if (visitor.target_->As()) { + for (auto& stmt : visitor.target_->As()->stmts) { + if (stmt.As()) { + if (stmt.As()->schedule_block.As()->name == block_name) { + auto block = ir::Block::Make({GetBlock(block_name)}); + auto loop = ir::For::Make(ir::Var(common::UniqName("ix")), + ir::Expr(0), + ir::Expr(1), + ir::ForType::Serial, + ir::DeviceAPI::UNK, + block); + stmt = loop; + return loop; + } + } + } + } else if (visitor.target_->As()) { 
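+    // Schematic: the parent For's body `b` becomes `for (ix, 0, 1) { b }`, so the
+    // target block always sits under at least one loop.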
+ auto block = ir::Block::Make({visitor.target_->As()->body}); + auto loop = ir::For::Make( + ir::Var(common::UniqName("ix")), ir::Expr(0), ir::Expr(1), ir::ForType::Serial, ir::DeviceAPI::UNK, block); + visitor.target_->As()->body = loop; + return loop; + } else if (visitor.target_->As()) { + auto block = ir::Block::Make({visitor.target_->As()->body}); + auto loop = ir::For::Make( + ir::Var(common::UniqName("ix")), ir::Expr(0), ir::Expr(1), ir::ForType::Serial, ir::DeviceAPI::UNK, block); + visitor.target_->As()->body = loop; + return loop; + } else { + LOG(FATAL) << "Can't find block's parent!"; + } + LOG(FATAL) << "Shouldn't reach code here in AddUnitLoop"; + return Expr{nullptr}; +} + +std::vector ScheduleImpl::GetLoops(const Expr& block) const { + std::vector result; + auto exprs = module_expr_.GetExprs(); + CHECK(block.As()); + CHECK(block.As()->schedule_block.As()); + std::string block_name = block.As()->schedule_block.As()->name; + + for (auto& it_expr : exprs) { + ir::FindLoopsVisitor visitor(block); + auto find_loops = visitor(&it_expr); + if (!find_loops.empty()) { + if (!result.empty()) LOG(FATAL) << "Find block with name: \n" << block_name << " appeared in more than one AST!"; + result = find_loops; + } + } + + if (result.empty()) { + result.push_back(AddUnitLoop(block)); + } + return result; +} + +std::vector ScheduleImpl::GetLoops(const std::string& block_name) const { + Expr block = this->GetBlock(block_name); + std::vector result = this->GetLoops(block); + return result; +} + +std::vector ScheduleImpl::GetAllBlocks() const { + std::vector result; + auto exprs = module_expr_.GetExprs(); + for (auto& it_expr : exprs) { + ir::FindBlocksVisitor visitor; + auto find_blocks = visitor(&it_expr); + result.insert(result.end(), find_blocks.begin(), find_blocks.end()); + } + for (auto& it_expr : exprs) { + VLOG(3) << "it_expr is : " << it_expr; + } + CHECK(!result.empty()) << "Didn't find blocks in expr."; + return result; +} + +std::vector ScheduleImpl::GetChildBlocks(const Expr& expr) const { + CHECK(expr.As() || expr.As()); + ir::FindBlocksVisitor visitor; + std::vector result = visitor(&expr); + return result; +} + +bool ScheduleImpl::HasBlock(const std::string& block_name) const { + auto exprs = module_expr_.GetExprs(); + for (auto& it_expr : exprs) { + ir::FindBlocksVisitor visitor(block_name); + auto find_blocks = visitor(&it_expr); + if (!find_blocks.empty()) { + CHECK_EQ(find_blocks.size(), 1U) << "There should not be more than 1 block with identical name!"; + return true; + } + } + return false; +} + +Expr ScheduleImpl::GetBlock(const std::string& block_name) const { + Expr result; + auto exprs = module_expr_.GetExprs(); + for (auto& it_expr : exprs) { + ir::FindBlocksVisitor visitor(block_name); + auto find_blocks = visitor(&it_expr); + if (!find_blocks.empty()) { + CHECK_EQ(find_blocks.size(), 1U) << "There should not be more than 1 block with identical name!"; + result = find_blocks[0]; + return result; + } + } + LOG(FATAL) << "Didn't find a block with name " << block_name << " in this ModuleExpr!"; +} + +void ScheduleImpl::Annotate(const Expr& block, const std::string& key, const attr_t& value) { + CHECK(block.As()); + CHECK(block.As()->schedule_block.As()); + auto copied_block = optim::IRCopy(block); + auto* schedule_block = copied_block.As()->schedule_block.As(); + schedule_block->attrs.emplace(key, value); + this->Replace(block, copied_block); +} + +void ScheduleImpl::Unannotate(Expr& block, const std::string& ann_key) { + CHECK(block.As()); + 
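+  // e.g. Unannotate(block, "my_key") removes an attribute previously added by
+  // Annotate(block, "my_key", ...); a missing key only logs a warning below.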
CHECK(block.As()->schedule_block.As()); + auto* schedule_block = block.As()->schedule_block.As(); + if (schedule_block->attrs.count(ann_key)) { + schedule_block->attrs.erase(ann_key); + } else { + LOG(WARNING) << "Can't find annotation with key: " << ann_key; + return; + } +} + +void ScheduleImpl::FlattenLoops(const std::vector& loops, const bool flat_tensor) { + CHECK_GT(loops.size(), 0) << "Loops can't be empty!"; + VLOG(4) << "Before FlattenLoops, ir is:\n" << loops[0]; + // compute loop + int extent = 1; + std::vector strides; + std::vector loop_vars(loops.size()); + for (int idx = loops.size() - 1; idx >= 0; --idx) { + strides.insert(strides.begin(), extent); + extent *= loops[idx].As()->extent.as_int32(); + loop_vars[idx] = loops[idx].As()->loop_var; + } + CHECK_EQ(loops.size(), strides.size()); + + // create new loop. + auto last = loops.back().As(); + auto var = ir::Var("flat_i"); + auto _var = ir::Var("_flat_i"); + auto loop = ir::For::Make(var, ir::Expr(0), ir::Expr(extent), last->for_type(), last->device_api, last->body); + + // map loop var to old loop var. + auto _iter = ir::Expr(_var); + std::unordered_map loops_to_flat_var_map; + for (int idx = 0; idx < strides.size(); ++idx) { + if (strides[idx] == 1) { + // flat_i_to_loop_var.push_back(_iter); + loops_to_flat_var_map[loops[idx].As()->loop_var->name] = _iter; + } else { + // flat_i_to_loop_var.push_back(_iter / Expr(strides[idx])); + loops_to_flat_var_map[loops[idx].As()->loop_var->name] = _iter / Expr(strides[idx]); + _iter = _iter % Expr(strides[idx]); + } + } + + ir::FindBlocksVisitor visitor; + auto blocks = visitor(&last->body); + auto can_do_flat = [](const std::vector& indexs, const std::vector& loop_vars) { + if (indexs.size() != loop_vars.size()) { + return false; + } + + for (int idx = 0; idx < indexs.size(); ++idx) { + if (!indexs[idx].as_var()) { + return false; + } else { + auto var = indexs[idx].as_var_ref(); + if (var->name != loop_vars[idx]->name) { + return false; + } + } + } + return true; + }; + + // change blocks iter value/iter var + for (auto& block : blocks) { + auto block_realize = block.As(); + auto schedule_block = block_realize->schedule_block.As(); + + // checkout loops in orders. + std::vector var_names = {}; + CHECK_GE(block_realize->iter_values.size(), loop_vars.size()) + << "the number of iter bind values must be more than loop vars!"; + for (int idx = 0; idx < block_realize->iter_values.size(); ++idx) { + auto& iter = block_realize->iter_values[idx]; + if (iter.is_var()) { + CHECK_EQ(iter.as_var_ref()->name, loop_vars[idx]->name) << "loops is not the same order with tensor!"; + } else { + CHECK(iter.As()); + CHECK_EQ(iter.as_int32(), 0); + } + } + + auto exprs = ir::CollectIRNodesInOrder(schedule_block->body, + [&](const Expr* x) { return x->As() || x->As(); }); + // reverse exprs from last to first. + std::reverse(std::begin(exprs), std::end(exprs)); + + std::vector var_to_replace; + std::vector flat_i_to_loop_var; + // if iter var is more than flat i to loop, there exist dim = 1. + for (int idx = 0; idx < block_realize->iter_values.size(); ++idx) { + if (block_realize->iter_values[idx].is_var()) { + var_to_replace.push_back(schedule_block->iter_vars[idx]); + auto var_name = block_realize->iter_values[idx].as_var_ref()->name; + CHECK(loops_to_flat_var_map.count(var_name)) << "Can't find var name : " << var_name; + flat_i_to_loop_var.push_back(loops_to_flat_var_map[var_name]); + } else { + CHECK_EQ(block_realize->iter_values[idx].as_int32(), 0); + // insert var -> 0, to replace var to 0. 
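+        // Worked example: loop extents (4, 5, 6) give strides (30, 6, 1) and a flat
+        // extent of 120; the loop vars map to _flat_i / 30, (_flat_i % 30) / 6 and
+        // _flat_i % 6, while constant-0 iter values are replaced by Expr(0).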
+ var_to_replace.push_back(schedule_block->iter_vars[idx]); + flat_i_to_loop_var.push_back(Expr(0)); + } + } + CHECK_EQ(var_to_replace.size(), flat_i_to_loop_var.size()); + + for (auto expr : exprs) { + if (expr.As()) { + auto store = expr.As(); + if (store->is_addr_tensor()) { + auto t = store->tensor.as_tensor_ref(); + CHECK(!t->reduce_axis.size()); + auto tsize = std::accumulate(t->shape.begin(), t->shape.end(), 1, [](const int sum, const Expr& expr) { + return sum * expr.as_int32(); + }); + if ((!flat_tensor && !can_do_flat(store->indices, schedule_block->iter_vars)) || extent != tsize) { + // just replace indexs + for (auto& indice : store->indices) { + if (!indice.is_var()) { + continue; + } + ReplaceExpr(&indice, var_to_replace, flat_i_to_loop_var); + } + // compute index and flat tensor. + store->indices = {store->index()}; + continue; + } + // update var and shape + store->indices = {Expr(_var)}; + } + } else { + auto load = expr.As(); + if (load->is_addr_tensor()) { + auto t = load->tensor.as_tensor_ref(); + CHECK(!t->reduce_axis.size()); + auto tsize = std::accumulate(t->shape.begin(), t->shape.end(), 1, [](const int sum, const Expr& expr) { + return sum * expr.as_int32(); + }); + if ((!flat_tensor && !can_do_flat(load->indices, schedule_block->iter_vars)) || extent != tsize) { + // just replace indexs + for (auto& indice : load->indices) { + if (!indice.is_var()) { + continue; + } + ReplaceExpr(&indice, var_to_replace, flat_i_to_loop_var); + } + // compute index and flat tensor. + load->indices = {load->index()}; + continue; + } + // update var and shape + load->indices = {Expr(_var)}; + } + } + } + ReplaceExpr(&schedule_block->body, var_to_replace, flat_i_to_loop_var); + + // update iter values + auto iter = ir::Expr(var); + block_realize->iter_values = {iter}; + + // update iter_vars + schedule_block->iter_vars = {_var}; + CHECK_EQ(block_realize->iter_values.size(), schedule_block->iter_vars.size()); + } + + this->Replace(loops[0], loop); + VLOG(4) << "After FlattenLoops, ir is:\n" << loop; +} + +void ScheduleImpl::CopyTransformAndLoopInfo(const std::string& block_name, const std::string& block_target_name) { + auto block = this->GetBlock(block_name); + auto block_target = this->GetBlock(block_target_name); + this->CopyTransformAndLoopInfo(block, block_target); +} + +void ScheduleImpl::CopyTransformAndLoopInfo(const Expr& block, const Expr& block_target) { + CHECK(block.As()); + CHECK(block_target.As()); + auto exprs = this->GetModule().GetExprs(); + CHECK_EQ(exprs.size(), 1U); + auto expr = exprs[0]; + auto vars = block.As()->schedule_block.As()->iter_vars; + auto vars_target = block_target.As()->schedule_block.As()->iter_vars; + auto old_iter_values = block.As()->iter_values; + auto iter_values_target = block_target.As()->iter_values; + std::vector new_iter_values; + for (int i = 0; i < vars.size() && i < vars_target.size(); ++i) { + CHECK(vars[i]->upper_bound.defined() && vars_target[i]->upper_bound.defined()); + if (vars[i]->upper_bound.is_constant() && vars_target[i]->upper_bound.is_constant() && + vars[i]->upper_bound.get_constant() == vars_target[i]->upper_bound.get_constant() && !vars[i]->is_reduce_axis && + !vars_target[i]->is_reduce_axis) { + new_iter_values.push_back(iter_values_target[i]); + VLOG(3) << "new_iter_values.push_back " << iter_values_target[i]; + } else + break; + } + + if (new_iter_values.empty()) + LOG(FATAL) << "Cannot CopyTransformAndLoopInfo since shape[0] of source and target is not equal! 
" + << vars[0]->upper_bound << " v.s " << vars_target[0]->upper_bound; + + int changed_loop_num = new_iter_values.size(); + std::set used_target_loop_vars; + for (auto& iter_val : new_iter_values) { + auto find_partial_loop = ir::CollectIRNodesWithoutTensor(iter_val, [&](const Expr* x) { + if (x->as_var()) used_target_loop_vars.insert(x->as_var_ref()->name); + return x->as_var(); + }); + } + CHECK(!used_target_loop_vars.empty()); + std::vector used_target_loops; + auto expr_copy = optim::IRCopy(expr); + for (auto& var : used_target_loop_vars) { + auto find_loop_var = ir::CollectIRNodesWithoutTensor( + expr_copy, + [&](const Expr* x) { + return x->As() && x->As()->loop_var->name == var && Contains(*x, block_target); + }, + true); + CHECK_EQ(find_loop_var.size(), 1U); + used_target_loops.push_back(*find_loop_var.begin()); + VLOG(3) << "used_target_loops push_back " << used_target_loops.back(); + } + std::sort(used_target_loops.begin(), used_target_loops.end(), [&](Expr i, Expr j) { + return (utils::GetStreamCnt(i).size() > utils::GetStreamCnt(j).size()); + }); + for (int i = new_iter_values.size(); i < old_iter_values.size(); ++i) { + CHECK(old_iter_values[i].as_var()); + new_iter_values.push_back(old_iter_values[i]); + } + Expr new_loop; + VLOG(3) << "changed_loop_num is : " << changed_loop_num; + VLOG(3) << "old_iter_values.size() is : " << old_iter_values.size(); + if (changed_loop_num >= (int)old_iter_values.size()) { + new_loop = optim::IRCopy(block); + new_loop.As()->iter_values = new_iter_values; + } else { + CHECK(old_iter_values[changed_loop_num].as_var()); + auto old_var = old_iter_values[changed_loop_num].as_var_ref(); + auto find_partial_loop = ir::CollectIRNodesWithoutTensor( + expr, + [&](const Expr* x) { + return x->As() && x->As()->loop_var->name == old_var->name && Contains(*x, block); + }, + true); + CHECK_EQ(find_partial_loop.size(), 1U); + new_loop = optim::IRCopy(*find_partial_loop.begin()); + auto find_schedule_block = ir::CollectIRNodesWithoutTensor( + new_loop, [&](const Expr* x) { return x->As(); }, true); + CHECK_EQ(find_schedule_block.size(), 1U); + Expr sch_block = (*find_schedule_block.begin()); + sch_block.As()->iter_values = new_iter_values; + } + VLOG(3) << "new_loop is : " << new_loop; + CHECK(!used_target_loops.empty()); + Expr res; + if (used_target_loops.size() == 1) { + auto for_loop = used_target_loops[0].As(); + res = For::Make(for_loop->loop_var, + for_loop->min, + for_loop->extent, + for_loop->for_type(), + for_loop->device_api, + new_loop, + for_loop->vectorize_info(), + for_loop->bind_info()); + } else { + Expr outer_loop = used_target_loops.front(); + Expr inner_loop = used_target_loops.back(); + inner_loop.As()->body = Block::Make({new_loop}); + res = outer_loop; + } + VLOG(3) << "res is : " << res; + std::vector all_loops = this->GetLoops(block); + CHECK(!all_loops.empty()); + this->Replace(all_loops[0], res); +} + +std::vector ScheduleImpl::SamplePerfectTile(utils::LinearRandomEngine::StateType* rand_seed, + const Expr& loop, + int n, + int max_innermost_factor) { + CHECK(loop.As()) << "Expr param of SamplePerfectTile should be a For loop"; + CHECK_GE(n, 2) << "The number of tile factors should be at least 2"; + CHECK_GE(max_innermost_factor, 1) << "The max innermost factor should be at least 1"; + CHECK(common::is_zero(loop.As()->min)) << "The For loop should start from 0"; + int loop_extent = GetLoopExtent(loop); + std::vector innermost_factors; + for (int i = max_innermost_factor; i >= 1; --i) { + if (loop_extent % i == 0) { + 
innermost_factors.push_back(i); + } + } + CHECK(!innermost_factors.empty()) << "No innermost factor found"; + int innermost_factor = innermost_factors[utils::SampleUniformInt(0, innermost_factors.size(), rand_seed)]; + auto result = SampleTile(rand_seed, n - 1, loop_extent / innermost_factor); + std::vector result_expr; + for (auto& factor : result) { + result_expr.push_back(Expr(factor)); + } + result_expr.push_back(Expr(innermost_factor)); + return result_expr; +} + +Expr ScheduleImpl::SampleCategorical(utils::LinearRandomEngine::StateType* rand_seed, + const std::vector& candidates, + const std::vector& probs) { + // check two sizes + CHECK_EQ(candidates.size(), probs.size()) << "candidates and probs must have same size."; + int seed_idx = utils::SampleDiscreteFromDistribution(probs, rand_seed); + auto result = candidates[seed_idx]; + Expr result_expr(result); + return result_expr; +} + +IRSchedule::IRSchedule() {} + +IRSchedule::IRSchedule(const ModuleExpr& module_expr, utils::LinearRandomEngine::StateType rand_seed, bool debug_flag) { + impl_ = std::make_unique(module_expr, debug_flag); + this->InitSeed(rand_seed); +} + +IRSchedule::IRSchedule(ir::ModuleExpr&& mod_expr, ScheduleDesc&& trace, utils::LinearRandomEngine::StateType rand_seed) + : impl_(std::make_unique(std::move(mod_expr))), trace_(std::move(trace)) { + this->InitSeed(rand_seed); +} + +IRSchedule::IRSchedule(const IRSchedule& other) + : impl_(std::make_unique(optim::IRCopy(other.GetModule()))), trace_(other.trace_) { + this->InitSeed(other.ForkSeed()); +} + +IRSchedule& IRSchedule::operator=(const IRSchedule& src) { + impl_ = std::make_unique(optim::IRCopy(src.GetModule())); + trace_ = src.trace_; + this->InitSeed(src.ForkSeed()); + return *this; +} + +IRSchedule::IRSchedule(IRSchedule&& other) : impl_(std::move(other.impl_)), trace_(std::move(other.trace_)) { + this->InitSeed(other.ForkSeed()); +} + +IRSchedule& IRSchedule::operator=(IRSchedule&& src) { + impl_ = std::move(src.impl_); + trace_ = std::move(src.trace_); + this->InitSeed(src.ForkSeed()); + return *this; +} + +IRSchedule::~IRSchedule() {} + +void IRSchedule::InitSeed(utils::LinearRandomEngine::StateType rand_seed) { + this->rand_seed_ = utils::LinearRandomEngine::NormalizeState(rand_seed); +} + +utils::LinearRandomEngine::StateType IRSchedule::ForkSeed() const { return utils::ForkRandomState(&rand_seed_); } + +void IRSchedule::SetExprs(const std::vector& exprs) { + return impl_->SetExprs(exprs); + // no need to trace +} + +const ModuleExpr& IRSchedule::GetModule() const { + return impl_->GetModule(); + // no need to trace +} + +bool IRSchedule::HasBlock(const std::string& block_name) const { + return impl_->HasBlock(block_name); + // no need to trace +} + +void IRSchedule::MergeExprs() { + impl_->MergeExprs(); + trace_.Append(ScheduleDesc::Step("MergeExprs", {}, {}, {})); +} + +std::vector IRSchedule::GetLoops(const Expr& block) const { + auto results = impl_->GetLoops(block); + trace_.Append(ScheduleDesc::Step("GetLoops", {{"block", std::vector({block})}}, {}, results)); + return results; +} + +std::vector IRSchedule::GetLoops(const std::string& block_name) const { + auto results = impl_->GetLoops(block_name); + trace_.Append(ScheduleDesc::Step("GetLoopsWithName", {}, {{"block_name", block_name}}, results)); + return results; +} + +std::vector IRSchedule::GetAllBlocks() const { + auto results = impl_->GetAllBlocks(); + trace_.Append(ScheduleDesc::Step("GetAllBlocks", {}, {}, results)); + return results; +} + +std::vector IRSchedule::GetChildBlocks(const 
Expr& expr) const { + auto results = impl_->GetChildBlocks(expr); + trace_.Append(ScheduleDesc::Step("GetChildBlocks", {{"expr", std::vector({expr})}}, {}, results)); + return results; +} + +Expr IRSchedule::GetBlock(const std::string& block_name) const { + auto result = impl_->GetBlock(block_name); + trace_.Append(ScheduleDesc::Step("GetBlock", {}, {{"block_name", block_name}}, {result})); + return result; +} + +std::vector IRSchedule::Split(const Expr& loop, const std::vector& factors) { + std::vector decision = SamplePerfectTile(loop, factors.size(), loop.As()->extent.as_int32(), factors); + auto results = Split(loop, decision); + return results; +} + +std::vector IRSchedule::Split(const std::string& block_name, int loop_index, const std::vector& factors) { + std::vector all_loops = this->GetLoops(block_name); + Expr loop_expr; + CHECK_LT(loop_index, (int)all_loops.size()) << "The loop index in Split should be less than total loop's number."; + CHECK_GE(loop_index, 0) << "The loop index in Split should be >= 0."; + loop_expr = all_loops[loop_index]; + + return this->Split(loop_expr, factors); +} + +std::vector IRSchedule::Split(const Expr& loop, const std::vector& factors) { + std::vector int_factors; + std::transform(factors.begin(), factors.end(), std::back_inserter(int_factors), [](Expr x) { return x.as_int32(); }); + auto results = impl_->Split(loop, int_factors); + trace_.Append(ScheduleDesc::Step("Split", {{"loop", std::vector({loop})}, {"factors", factors}}, {}, results)); + return results; +} + +Expr IRSchedule::Fuse(const std::vector& loops) { + auto result = impl_->Fuse(loops); + trace_.Append(ScheduleDesc::Step("Fuse", {{"loops", loops}}, {}, {result})); + return result; +} + +Expr IRSchedule::Fuse(const std::string& block_name, const std::vector& loops_index) { + auto result = impl_->Fuse(block_name, loops_index); + trace_.Append( + ScheduleDesc::Step("FuseWithName", {}, {{"block_name", block_name}, {"loops_index", loops_index}}, {result})); + return result; +} + +Expr IRSchedule::Fuse(const Expr& block, const std::vector& loops_index) { + auto result = impl_->Fuse(block, loops_index); + trace_.Append(ScheduleDesc::Step( + "FuseWithBlock", {{"block", std::vector({block})}}, {{"loops_index", loops_index}}, {result})); + return result; +} + +void IRSchedule::ComputeAt(const Expr& block, const Expr& loop, bool keep_unit_loops) { + impl_->ComputeAt(block, loop, keep_unit_loops); + trace_.Append(ScheduleDesc::Step("ComputeAt", + {{"block", std::vector({block})}, {"loop", std::vector({loop})}}, + {{"keep_unit_loops", keep_unit_loops}}, + {})); +} + +void IRSchedule::SimpleComputeAt(const Expr& block, const Expr& loop) { + impl_->SimpleComputeAt(block, loop); + trace_.Append(ScheduleDesc::Step( + "SimpleComputeAt", {{"block", std::vector({block})}, {"loop", std::vector({loop})}}, {}, {})); +} + +void IRSchedule::ReverseComputeAt(const Expr& block, const Expr& loop, bool keep_unit_loops) { + impl_->ReverseComputeAt(block, loop, keep_unit_loops); + trace_.Append(ScheduleDesc::Step("ReverseComputeAt", + {{"block", std::vector({block})}, {"loop", std::vector({loop})}}, + {{"keep_unit_loops", keep_unit_loops}}, + {})); +} + +Expr IRSchedule::GetRootBlock(const Expr& expr) const { + auto result = impl_->GetRootBlock(expr); + trace_.Append(ScheduleDesc::Step("GetRootBlock", {{"expr", std::vector({expr})}}, {}, {result})); + return result; +} + +Expr IRSchedule::CacheRead(const Expr& block, int read_buffer_index, const std::string& memory_type) { + auto result = impl_->CacheRead(block, 
read_buffer_index, memory_type); + trace_.Append(ScheduleDesc::Step("CacheRead", + {{"block", std::vector({block})}}, + {{"read_buffer_index", read_buffer_index}, {"memory_type", memory_type}}, + {result})); + return result; +} + +Expr IRSchedule::CacheWrite(const Expr& block, int write_buffer_index, const std::string& memory_type) { + auto result = impl_->CacheWrite(block, write_buffer_index, memory_type); + trace_.Append(ScheduleDesc::Step("CacheWrite", + {{"block", std::vector({block})}}, + {{"write_buffer_index", write_buffer_index}, {"memory_type", memory_type}}, + {result})); + return result; +} + +void IRSchedule::SyncThreads(const Expr& ir_node, bool after_node) { + impl_->SyncThreads(ir_node, after_node); + trace_.Append( + ScheduleDesc::Step("SyncThreads", {{"ir_node", std::vector({ir_node})}}, {{"after_node", after_node}}, {})); +} + +void IRSchedule::SetBuffer(Expr& block, const std::string& memory_type, bool fixed) { + impl_->SetBuffer(block, memory_type, fixed); + trace_.Append(ScheduleDesc::Step( + "SetBuffer", {{"block", std::vector({block})}}, {{"memory_type", memory_type}, {"fixed", fixed}}, {})); +} + +Expr IRSchedule::Reorder(const std::vector& loops) { + Expr ret = impl_->Reorder(loops); + trace_.Append(ScheduleDesc::Step("Reorder", {{"loops", loops}}, {}, {ret})); + return ret; +} + +Expr IRSchedule::Reorder(const std::string& block_name, const std::vector& loops_index) { + Expr ret = impl_->Reorder(block_name, loops_index); + trace_.Append( + ScheduleDesc::Step("ReorderWithName", {}, {{"block_name", block_name}, {"loops_index", loops_index}}, {ret})); + return ret; +} + +Expr IRSchedule::Reorder(const Expr& block, const std::vector& loops_index) { + Expr ret = impl_->Reorder(block, loops_index); + trace_.Append(ScheduleDesc::Step( + "ReorderWithBlock", {{"block", std::vector({block})}}, {{"loops_index", loops_index}}, {ret})); + return ret; +} + +void IRSchedule::Parallel(const Expr& loop) { + impl_->Parallel(loop); + trace_.Append(ScheduleDesc::Step("Parallel", {{"loop", std::vector({loop})}}, {}, {})); +} + +void IRSchedule::Vectorize(const Expr& loop, int factor) { + impl_->Vectorize(loop, factor); + trace_.Append(ScheduleDesc::Step("Vectorize", {{"loop", std::vector({loop})}}, {{"factor", factor}}, {})); +} + +void IRSchedule::Unroll(const Expr& loop) { + impl_->Unroll(loop); + trace_.Append(ScheduleDesc::Step("Unroll", {{"loop", std::vector({loop})}}, {}, {})); +} + +void IRSchedule::ComputeInline(const Expr& schedule_block) { + impl_->ComputeInline(schedule_block); + trace_.Append(ScheduleDesc::Step("ComputeInline", {{"schedule_block", std::vector({schedule_block})}}, {}, {})); +} + +void IRSchedule::ReverseComputeInline(const Expr& schedule_block) { + impl_->ReverseComputeInline(schedule_block); + trace_.Append( + ScheduleDesc::Step("ReverseComputeInline", {{"schedule_block", std::vector({schedule_block})}}, {}, {})); +} + +void IRSchedule::Bind(const Expr& loop, const std::string& thread_axis) { + impl_->Bind(loop, thread_axis); + trace_.Append(ScheduleDesc::Step("Bind", {{"loop", std::vector({loop})}}, {{"thread_axis", thread_axis}}, {})); +} + +Expr IRSchedule::Rfactor(const Expr& rf_loop, int rf_axis) { + auto result = impl_->Rfactor(rf_loop, rf_axis); + trace_.Append( + ScheduleDesc::Step("Rfactor", {{"rf_loop", std::vector({rf_loop})}}, {{"rf_axis", rf_axis}}, {result})); + return result; +} + +void IRSchedule::Annotate(const Expr& block, const std::string& key, const attr_t& value) { + impl_->Annotate(block, key, value); + +#define 
TRACE_ANNOTATE_ITEM(data_type, step_name) \ + if (absl::holds_alternative(value)) { \ + trace_.Append(ScheduleDesc::Step(#step_name, \ + {{"block", std::vector({block})}}, \ + {{"key", key}, {"value", absl::get(value)}}, \ + {})); \ + return; \ + } + TRACE_ANNOTATE_ITEM(int, AnnotateIntAttr) + TRACE_ANNOTATE_ITEM(bool, AnnotateBoolAttr) + TRACE_ANNOTATE_ITEM(float, AnnotateFloatAttr) + TRACE_ANNOTATE_ITEM(std::string, AnnotateStringAttr) +#undef TRACE_ANNOTATE_ITEM + + LOG(FATAL) << "Value of attribute:" << key << " input unsupported data type"; +} + +void IRSchedule::Unannotate(Expr& block, const std::string& key) { + impl_->Unannotate(block, key); + trace_.Append(ScheduleDesc::Step("Unannotate", {{"block", std::vector({block})}}, {{"key", key}}, {})); +} + +void IRSchedule::FlattenLoops(const std::vector& loops, const bool force_flat) { + impl_->FlattenLoops(loops, force_flat); + trace_.Append( + ScheduleDesc::Step("FlattenLoops", {{"loop", std::vector({loops})}}, {{"force_flat", force_flat}}, {})); +} + +void IRSchedule::CopyTransformAndLoopInfo(const Expr& block, const Expr& block_target) { + impl_->CopyTransformAndLoopInfo(block, block_target); + // don't support to trace, because we can't ensure both blocks are from the same ModuleExpr +} + +void IRSchedule::CopyTransformAndLoopInfo(const std::string& block_name, const std::string& block_target_name) { + impl_->CopyTransformAndLoopInfo(block_name, block_target_name); + // don't support to trace, because we can't ensure both blocks are from the same ModuleExpr +} + +std::vector IRSchedule::SamplePerfectTile(const Expr& loop, + int n, + int max_innermost_factor, + const std::vector& decision) { + std::vector factors; + std::vector new_decision; + if (decision.empty()) { + factors = impl_->SamplePerfectTile(&rand_seed_, loop, n, max_innermost_factor); + std::transform( + factors.begin(), factors.end(), std::back_inserter(new_decision), [](Expr x) { return x.as_int32(); }); + } else { + new_decision = decision; + std::transform(decision.begin(), decision.end(), std::back_inserter(factors), [](int x) { return Expr(x); }); + } + trace_.Append( + ScheduleDesc::Step("SamplePerfectTile", + {{"loop", std::vector({loop})}}, + {{"n", n}, {"max_innermost_factor", max_innermost_factor}, {"decision", new_decision}}, + factors)); + return factors; +} + +void IRSchedule::TagPostSchedule() { trace_.Append(ScheduleDesc::Step("TagPostSchedule", {}, {}, {})); } + +Expr IRSchedule::SampleCategorical(const std::vector& candidates, + const std::vector& probs, + const std::vector& decision) { + Expr result; + std::vector new_decision; + if (decision.empty()) { + result = impl_->SampleCategorical(&rand_seed_, candidates, probs); + new_decision.push_back(result.as_int32()); + } else { + new_decision = decision; + for (auto ndco : new_decision) { + result = Expr(ndco); + } + } + trace_.Append(ScheduleDesc::Step( + "SampleCategorical", {}, {{"candidates", candidates}, {"probs", probs}, {"decision", new_decision}}, {result})); + return result; +} + +} // namespace ir +} // namespace cinn diff --git a/paddle/cinn/ir/ir_schedule.h b/paddle/cinn/ir/ir_schedule.h new file mode 100644 index 0000000000000..6b7b252a57dec --- /dev/null +++ b/paddle/cinn/ir/ir_schedule.h @@ -0,0 +1,614 @@ +// Copyright (c) 2021 CINN Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include +#include +#include +#include + +#include "cinn/ir/ir.h" +#include "cinn/ir/ir_base.h" +#include "cinn/ir/ir_mutator.h" +#include "cinn/ir/schedule_desc.h" +#include "cinn/ir/tensor.h" +#include "cinn/utils/random_engine.h" + +namespace cinn { +namespace ir { + +/** + * A struct representing a module that contains Expr. This struct is only used in Schedule process. + */ +class ModuleExpr { + public: + ModuleExpr() = default; + ModuleExpr(const ModuleExpr& mod_expr) = default; + ModuleExpr(ModuleExpr&& mod_expr) = default; + + ModuleExpr& operator=(const ModuleExpr& mod_expr) = default; + + explicit ModuleExpr(const std::vector& exprs) : exprs_(exprs) {} + explicit ModuleExpr(std::vector&& exprs) : exprs_(std::move(exprs)) {} + + //! Get all the Expr in this ModuleExpr. + std::vector GetExprs() { return exprs_; } + + std::vector GetExprs() const { return exprs_; } + + void SetExprs(const std::vector& exprs) { exprs_ = exprs; } + + private: + //! Exprs stored in ModuleExpr. Each one is an AST, representing a computation kernel. + std::vector exprs_; +}; + +/** + * A struct containing all the schedule primitives. Each shedule primitive is a member function of IRSchedule. + * Schedule primitves are implmented by ScheduleImpl manipulating the AST - IR(Expr). + * To support serializing and replaying, each schedule primitive should append a ScheduleDesc::Step to + * the trace_ in its corresponding function implment. + */ +class ScheduleImpl; +class IRSchedule { + public: + IRSchedule(); + explicit IRSchedule(const ModuleExpr& modexpr, + utils::LinearRandomEngine::StateType rand_seed = -1, + bool debug_flag = false); + IRSchedule(ir::ModuleExpr&& mod_expr, ScheduleDesc&& trace, utils::LinearRandomEngine::StateType rand_seed = -1); + IRSchedule(const IRSchedule& other); + IRSchedule& operator=(const IRSchedule& src); + IRSchedule(IRSchedule&& other); + IRSchedule& operator=(IRSchedule&& src); + ~IRSchedule(); + + void SetExprs(const std::vector& exprs); + + //! Get the ModuleExpr stored in ScheduleImpl. + const ModuleExpr& GetModule() const; + + //! Determine whether a specific block is included + bool HasBlock(const std::string& block_name) const; + + //! Merge multiple Exprs in a ModuleExpr to be one + void MergeExprs(); + + //! Get the ScheduleDesc that traces the scheduling process + const ScheduleDesc& GetTraceDesc() const { return trace_; } + + /** + * \brief Get all the loops of specific Block stored in ModuleExpr. + * @param block The block we find loop in. + * @return Loops of the block. + */ + std::vector GetLoops(const Expr& block) const; + + /** + * \brief Get all the loops of specific Block stored in ModuleExpr. + * @param block_name Name of the block. + * @return Loops of the block. + */ + std::vector GetLoops(const std::string& block_name) const; + + //! Get all blocks stored in this ModuleExpr. + std::vector GetAllBlocks() const; + + //! Get a block with the specific name. + Expr GetBlock(const std::string& block_name) const; + + /** + * \brief Get all the childblocks of specific Expr stored in ModuleExpr. 
+ * @param expr The expr we find childblock in, can be a loop or block. + * @return ChildBlocks of the block. + */ + std::vector GetChildBlocks(const Expr& expr) const; + + /** + * \brief Split a for loop into multiple loops, based on the factors. + * @param loop The loop to be splited. + * @param factors The factors we used to split the loop. + * @return The splited loops. + */ + std::vector Split(const Expr& loop, const std::vector& factors); + + /** + * \brief Split a for loop into multiple loops, based on the factors. + * @param block_name Name of the block we want to modify. + * @param loop_index Index of the loop to be splited. + * @param factors The factors we used to split the loop. + * @return The splited loops. + */ + std::vector Split(const std::string& block_name, int loop_index, const std::vector& factors); + + /** + * \brief Split a for loop into multiple loops, based on the factors, only used for deserialization of trace. + * @param loop The loop to be splited. + * @param factors The factors we used to split the loop. + * @return The splited loops. + */ + std::vector Split(const Expr& loop, const std::vector& factors); + + /** + * \brief Fuse for loops and return the fused loop. + * @param loops All the loops to be fused, stored in ascending order. + * @return The fused loop. + */ + Expr Fuse(const std::vector& loops); + + /** + * \brief Fuse for loops and return the fused loop. + * @param block_name Name of the block we want to modify. + * @param loops_index Indices of the loops to be fused, stored in ascending order. + * @return The fused loop. + */ + Expr Fuse(const std::string& block_name, const std::vector& loops_index); + + /** + * \brief Fuse for loops and return the fused loop. + * @param block The block we want to modify. + * @param loops_index Indices of the loops to be fused, stored in ascending order. + * @return The fused loop. + */ + Expr Fuse(const Expr& block, const std::vector& loops_index); + + /** + * \brief Move a producer block's location under a specific loop. + * @param block The block we want to move its computation location. + * @param loop The loop we will move the block to. + * @param keep_unit_loops Whether to keep the unit loop. + */ + void ComputeAt(const Expr& block, const Expr& loop, bool keep_unit_loops = false); + + /** + * \brief Move a block's location under a loop without considering their dependency. + * @param block The block we want to move its computation location. + * @param loop The loop we will move the block to. + */ + void SimpleComputeAt(const Expr& block, const Expr& loop); + + /** + * \brief Move a consumer block's location under a specific loop. + * @param block The block we want to move its computation location. + * @param loop The loop we will move the block to. + * @param keep_unit_loops Whether to keep the unit loop. + */ + void ReverseComputeAt(const Expr& block, const Expr& loop, bool keep_unit_loops = false); + + /** + * \brief Find an expr's root ScheduleBlockRealize node + * @param expr The expr node. + * @return Its root ScheduleBlockRealize node. + */ + Expr GetRootBlock(const Expr& expr) const; + + /** + * \brief Find a buffer that is being read, and create its cache. + * @param block Block that reads the buffer. + * @param read_buffer_index Index of the buffer being read in block. + * @param memory_type String that indicates the buffer's storage scope. + * @return The buffer's cache. 
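+   *
+   * A minimal sketch of the effect (tensor names here are illustrative; the real
+   * cache name comes from MakeCacheTensor):
+   * \code
+   * // before: block reads A directly
+   * B[i] = A[i] + 1
+   * // after CacheRead(block, 0, "shared"): the returned cache block fills the
+   * // staging buffer, and the original block reads from it instead
+   * A_shared[i] = A[i]
+   * B[i] = A_shared[i] + 1
+   * \endcode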
+  /**
+   * \brief Find a buffer that is being read, and create its cache.
+   * @param block Block that reads the buffer.
+   * @param read_buffer_index Index of the buffer being read in block.
+   * @param memory_type String that indicates the buffer's storage scope.
+   * @return The buffer's cache.
+   */
+  Expr CacheRead(const Expr& block, int read_buffer_index, const std::string& memory_type);
+
+  /**
+   * \brief Find a buffer that is being written, and create its cache.
+   * @param block Block that writes the buffer.
+   * @param write_buffer_index Index of the buffer being written in block.
+   * @param memory_type String that indicates the buffer's storage scope.
+   * @return The buffer's cache.
+   */
+  Expr CacheWrite(const Expr& block, int write_buffer_index, const std::string& memory_type);
+
+  /**
+   * \brief Add SyncThreads statements in the AST.
+   * @param ir_node The insertion point in the AST.
+   * @param after_node Whether to insert the statement after the insertion point. When it is true, we insert the
+   * SyncThreads statement after the insertion IR; when it is false, we insert the SyncThreads statement before the
+   * insertion IR.
+   */
+  void SyncThreads(const Expr& ir_node, bool after_node = true);
+
+  /*!
+   * \brief Set a tensor's buffer type (memory_type).
+   * \param block The ScheduleBlockRealize corresponding to a unique tensor.
+   * \param memory_type The memory type we want to set. Should be "local", "shared" or "global".
+   */
+  void SetBuffer(Expr& block, const std::string& memory_type, bool fixed = false);
+
+  /**
+   * \brief Reorder the loops in the order of the vector.
+   * @param loops The loops to be reordered.
+   * @return The reordered Expr, which can be ir::For or ir::Block. It is ir::For if
+   * the reordered loop is a single loop chain. It will be an ir::Block whose
+   * stmts contain several loop chains if the reordered computation has
+   * multiple loop chains.
+   */
+  Expr Reorder(const std::vector<Expr>& loops);
+
+  /**
+   * \brief Reorder the loops in the order of the vector elements.
+   * @param block_name Name of the block we want to modify.
+   * @param loops_index Indices of the loops to be reordered.
+   * @return The reordered Expr, which can be ir::For or ir::Block. It is ir::For if
+   * the reordered loop is a single loop chain. It will be an ir::Block whose
+   * stmts contain several loop chains if the reordered computation has
+   * multiple loop chains.
+   */
+  Expr Reorder(const std::string& block_name, const std::vector<int>& loops_index);
+
+  /**
+   * \brief Reorder the loops in the order of the vector elements.
+   * @param block The block we want to modify.
+   * @param loops_index Indices of the loops to be reordered.
+   * @return The reordered Expr, which can be ir::For or ir::Block. It is ir::For if
+   * the reordered loop is a single loop chain. It will be an ir::Block whose
+   * stmts contain several loop chains if the reordered computation has
+   * multiple loop chains.
+   */
+  Expr Reorder(const Expr& block, const std::vector<int>& loops_index);
+
+  /**
+   * \brief Get the device api of this IRSchedule.
+   * @return The device api of this IRSchedule.
+   */
+  DeviceAPI GetDeviceAPI() const;
+
+  /**
+   * \brief Change a for loop to be parallelized/vectorized/unrolled.
+   * @param loop The for loop to parallelize/vectorize/unroll.
+   * @param for_type The target for-loop type.
+   */
+  void MutateForType(const Expr& loop, ForType for_type, int factor = -1);
+
+  /**
+   * \brief Parallelize the given loop.
+   * @param loop the loop to parallelize.
+   */
+  void Parallel(const Expr& loop);
+
+  /**
+   * \brief Vectorize the given loop.
+   * @param loop the loop to vectorize.
+   * @param factor the vectorization factor.
+   */
+  void Vectorize(const Expr& loop, int factor);
+
+  /**
+   * \brief Unroll the given loop.
+   * @param loop the loop to unroll.
+   */
+  void Unroll(const Expr& loop);
+
+  /**
+   * \brief Mark a schedule block as inlined.
+   * @param schedule_block the schedule block to be inlined.
+   */
+  void ComputeInline(const Expr& schedule_block);
+
+  /**
+   * \brief Inline a consumer block into its only producer.
+   * @param schedule_block the schedule block to be inlined.
+   */
+  void ReverseComputeInline(const Expr& schedule_block);
+
+  /**
+   * \brief Bind the loop to the given thread axis.
+   * @param loop the loop to bind.
+   * @param thread_axis the name of the thread axis the loop will be bound to.
+   */
+  void Bind(const Expr& loop, const std::string& thread_axis);
+
+  //! Copy another block's schedule transform.
+  void CopyTransformAndLoopInfo(const Expr& block, const Expr& block_target);
+
+  void CopyTransformAndLoopInfo(const std::string& block_name, const std::string& block_target_name);
+
+  /**
+   * \brief Factorize the reduction block by the given loop. The block will be split into two blocks: the rfactor
+   * block and the final write-back block.
+   * @param rf_loop the reduce loop to do the rfactor transformation on.
+   * @param rf_axis the axis where the newly generated loop is placed in the rfactor block.
+   * @return The newly created rfactor tensor.
+   *
+   * For example, input the block:
+   * \code
+   * for (i, 0, 10)      // serial loop
+   *   B_init[i] = 0
+   *   for (j, 0, 20)    // reduce loop
+   *     for (k, 0, 30)  // reduce loop
+   *       B[i] = B[i] + A[i, j, k]
+   * \endcode
+   *
+   * If the rfactor loop is k and rf_axis is 0, the rfactor transformation is divided into 2 steps:
+   * 1. Get the rfactor block, where the reduce loop k is transformed into a serial loop with no accumulation and a
+   * new rfactor tensor is created. The axis k will be placed in the rf_axis of the new rf_tensor. The rf_block is as
+   * follows:
+   * \code
+   * for (rf_k, 0, 30)   // rfactor loop k is transformed into a serial loop.
+   *   for (i, 0, 10)    // serial loop
+   *     rf_B_init[rf_k, i] = 0
+   *     for (j, 0, 20)  // reduce loop
+   *       rf_B[rf_k, i] = rf_B[rf_k, i] + A[i, j, rf_k]
+   * \endcode
+   * 2. Do the reduction of the rfactor loop k to get the final result block:
+   * \code
+   * for (i, 0, 10)  // serial loop
+   *   B_init[i] = 0
+   *   for (k, 0, 30)
+   *     B[i] = B[i] + rf_B[k, i]
+   * \endcode
+   */
+  Expr Rfactor(const Expr& rf_loop, int rf_axis);
+
+  /*!
+   * \brief Annotate a block with a key-value pair to set as its attribute.
+   * \param block The block to be annotated.
+   * \param key The attribute key.
+   * \param value The attribute value; its type should be one of those listed in attr_t.
+   */
+  void Annotate(const Expr& block, const std::string& key, const attr_t& value);
+
+  /*!
+   * \brief Cancel an annotation within a block using the key.
+   * \param block The block to be unannotated.
+   * \param key The attribute key.
+   */
+  void Unannotate(Expr& block, const std::string& key);
+
+  /*!
+   * \brief Flatten the loops into one dim.
+   * \param loops the loops to be flattened.
+   * \param force_flat whether to force the flattening of the right value.
+   */
+  // Temporary solution for simplifying the elementwise/broadcast/injective index.
+  // TODO(sunli): Solve Index Simplify.
+  void FlattenLoops(const std::vector<Expr>& loops, const bool force_flat = false);
+
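+  // Sketch of a typical GPU binding sequence built from the primitives above
+  // (hypothetical block "C" and extents; "blockIdx.x"/"threadIdx.x" are the
+  // thread-axis names accepted by Bind):
+  // \code
+  //   auto loops = sch.GetLoops("C");
+  //   auto parts = sch.Split(loops[0], {-1, 256});
+  //   sch.Bind(parts[0], "blockIdx.x");
+  //   sch.Bind(parts[1], "threadIdx.x");
+  // \endcode
+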
+  /*!
+   * \brief Sample the factors to tile a specific loop perfectly.
+   * \param loop the loop to be split.
+   * \param n the number of loop layers to split into.
+   * \param max_innermost_factor the maximum factor of the innermost loop.
+   * \param decision the decision data of the last sample, or artificially given decision data.
+   * \return the split factors of the loop (the larger the index, the more inner the corresponding loop).
+   * For example, a return value of {16, 64} means the loop will look like this:
+   * \code
+   * for (i, 0, 16) {
+   *   for (j, 0, 64) {
+   *     ...
+   *   }
+   * }
+   * \endcode
+   */
+  std::vector<Expr> SamplePerfectTile(const Expr& loop,
+                                      int n,
+                                      int max_innermost_factor,
+                                      const std::vector<int>& decision = {});
+
+  /*!
+   * \brief Insert a tag in schedule_desc to mark the beginning of post processing.
+   * The schedule primitive itself does not make any changes to the IR.
+   */
+  void TagPostSchedule();
+
+  /**
+   * \brief Randomly sample an integer according to the given distribution.
+   * @param candidates Candidate set of integers.
+   * @param probs Probability distribution over the candidate integer set.
+   * @param decision the decision data of the last sample, or artificially given decision data.
+   * @return The sampled random variable.
+   */
+  Expr SampleCategorical(const std::vector<int>& candidates,
+                         const std::vector<float>& probs,
+                         const std::vector<int>& decision = {});
+
+ private:
+  // Init the random seed with a new seed
+  void InitSeed(utils::LinearRandomEngine::StateType rand_seed);
+
+  // Fork a new seed from the current seed
+  utils::LinearRandomEngine::StateType ForkSeed() const;
+
+ private:
+  std::unique_ptr<ScheduleImpl> impl_;
+  mutable ScheduleDesc trace_;  // trace the scheduling process
+  mutable utils::LinearRandomEngine::StateType rand_seed_;
+};
+
+/*!
+ * \brief The base class of the inliners, which handles:
+ * 1) Removing the block to be inlined
+ * 2) Maintaining a list of index variables and their substitution for the buffer being inlined
+ */
+class BaseInliner : public ir::IRMutator<> {
+ protected:
+  explicit BaseInliner(const Tensor& inlined_tensor, const Expr& inlined_store)
+      : inlined_tensor_(inlined_tensor), inlined_store_(inlined_store) {}
+
+ public:
+  void operator()(Expr* expr);
+
+ private:
+  void Visit(const ir::Block* expr, Expr* op) override;
+
+ protected:
+  //! Check if the indices are valid. If so, set idx_vars_ properly.
+  bool UpdateAndCheckIndexVars(const std::vector<Expr>& indices, int expected_ndim);
+
+  void SetIndexSubstitution(const std::vector<Expr>& indices);
+
+ protected:
+  //! The tensor to be inlined
+  Tensor inlined_tensor_{nullptr};
+  //! The body of the block to be inlined
+  Expr inlined_store_{nullptr};
+  //! The indices used for indexing the buffer to be inlined
+  std::vector<Var> idx_vars_;
+  //! Replacing vars (idx_sub_var_) in indices with the corresponding exprs (idx_sub_expr_)
+  std::vector<Var> idx_sub_var_;
+  std::vector<Expr> idx_sub_expr_;
+
+ public:
+  /*!
+   * \brief The Expr to be replaced when removing the block
+   * \note The pair (src_stmt, tgt_stmt) is produced by LeafBlockRemovalPlan
+   */
+  Expr src_stmt{nullptr};
+  //! The Expr to replace the original one when removing the block
+  Expr tgt_stmt{nullptr};
+};
+
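+// One plausible driving sequence for the helpers declared around here (a
+// hedged sketch; `block`, `store` and `root` are assumed to be supplied by the
+// caller, e.g. the compute-inline implementation in ScheduleImpl):
+// \code
+//   ComputeInliner inliner(store.As<ir::Store>()->tensor.as_tensor_ref(), store);
+//   CHECK(inliner.BodyPatternAllowInline());
+//   // Fill (src_stmt, tgt_stmt) with the removal plan for `block`, then apply.
+//   LeafBlockRemovalPlan remove_plan(block, &inliner.src_stmt, &inliner.tgt_stmt);
+//   remove_plan(&root);
+//   inliner(&root);
+// \endcode
+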
+/*!
+ * \brief Helper to inline a producer block into its consumer(s).
+ * The derived class implements:
+ * Substituting a `Load` of the tensor to be inlined with its value calculation in the producer block
+ */
+class ComputeInliner : public BaseInliner {
+ public:
+  explicit ComputeInliner(const Tensor& inlined_tensor, const Expr& inlined_store)
+      : BaseInliner(inlined_tensor, inlined_store) {}
+
+  bool BodyPatternAllowInline();
+
+ private:
+  void Visit(const ir::Load* expr, Expr* op) override;
+
+  //! Replace a 'Load' node of the tensor with the value computed in its producer.
+  Expr ReplaceInlinedTensor(Expr* load);
+};
+
+/*!
+ * \brief Helper to inline a block into its producer.
+ * The derived class implements the following functionalities:
+ * 1) Substituting a `Load` of the tensor to be inlined
+ *    with its value calculation in the producer block
+ * 2) Analyzing the producer block to determine the remapping of index variables
+ */
+class ReverseComputeInliner : public BaseInliner {
+ public:
+  explicit ReverseComputeInliner(const Tensor& inlined_tensor,
+                                 const Expr& inlined_store,
+                                 const Expr& inlined_load,
+                                 const Expr& target_store)
+      : BaseInliner(inlined_tensor, inlined_store), inlined_load_(inlined_load), target_store_(target_store) {}
+
+  bool BodyPatternAllowInline();
+
+ protected:
+  Expr inlined_load_{nullptr};
+  Expr target_store_{nullptr};
+
+ private:
+  void Visit(const ir::Load* expr, Expr* op) override;
+  void Visit(const ir::Store* expr, Expr* op) override;
+
+  //! Replace a 'Load' node of the tensor with the 'Store' node of its consumer.
+  Expr ReplaceInlinedTensor(Expr* load);
+  Expr ReplaceTargetTensor(Expr* store);
+};
+
+// The struct used to remove the original block in ComputeAt.
+class LeafBlockRemovalPlan : public ir::IRMutator<> {
+ public:
+  LeafBlockRemovalPlan(const Expr& block, Expr* source_expr, Expr* target_expr)
+      : block_(block), source_expr_(source_expr), target_expr_(target_expr) {}
+
+  void operator()(Expr* expr) { IRMutator::Visit(expr, expr); }
+
+ private:
+  void Visit(const ir::ScheduleBlockRealize* expr, Expr* op) override {
+    if (*op == block_) {
+      find_block = true;
+      return;
+    }
+    IRMutator::Visit(expr, op);
+  }
+
+  void Visit(const ir::For* expr, Expr* op) override {
+    if (*op == block_) {
+      find_block = true;
+      return;
+    }
+    IRMutator::Visit(expr, op);
+  }
+
+  void Visit(const ir::Block* expr, Expr* op) override {
+    if (expr->stmts.size() > 1U) {
+      int block_index = -1;
+      for (int i = 0; i < expr->stmts.size(); ++i) {
+        auto keep_flag = find_block;
+        find_block = false;
+        auto* node = op->As<ir::Block>();
+        IRMutator::Visit(&node->stmts[i], &node->stmts[i]);
+        if (find_block) {
+          if (depth == 0) {
+            *source_expr_ = *op;
+            block_index = i;
+          }
+          depth++;
+        }
+        find_block = find_block || keep_flag;
+      }
+      if (block_index != -1) {
+        std::vector<Expr> new_stmts;
+        for (int i = 0; i < expr->stmts.size(); ++i) {
+          if (i == block_index)
+            continue;
+          else
+            new_stmts.push_back(expr->stmts[i]);
+        }
+        auto target_block = ir::Block::Make(new_stmts);
+        *target_expr_ = target_block;
+      }
+    } else {
+      IRMutator::Visit(expr, op);
+    }
+  }
+
+ private:
+  bool find_block{false};
+  int depth{0};
+  const Expr& block_;
+  Expr* source_expr_;
+  Expr* target_expr_;
+};
+
+class ComputeInlineChecker : public ir::IRMutator<> {
+ public:
+  ComputeInlineChecker(IRSchedule& schedule, Expr& block) : ir_schedule_(schedule), block_(block) {}
+
+  bool Check();
+
+  void BuildDataDependency();
+
+ private:
+  void Visit(const ir::Load* expr, Expr* op) {
+    // Check whether this Load reads the tensor written by the captured Store
+    if ((store_.As<ir::Store>()->tensor).as_tensor_ref()->name == expr->tensor.as_tensor_ref()->name) {
+      should_skip_ = false;
+      return;
+    }
+    IRMutator::Visit(expr, op);
+  }
+
+ private:
+  IRSchedule& ir_schedule_;
+  Expr& block_;
+
+  Expr store_;
+  bool should_skip_{true};
+};
+
+}  // namespace ir
+}  // namespace cinn
diff --git a/paddle/cinn/ir/ir_schedule_util.cc b/paddle/cinn/ir/ir_schedule_util.cc
new file mode 100644
index 0000000000000..054e05dee06d3
--- /dev/null
+++ b/paddle/cinn/ir/ir_schedule_util.cc
@@ -0,0 +1,1038 @@
+// Copyright (c) 2022 CINN Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "cinn/ir/ir_schedule_util.h"
+
+#include
+
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include "cinn/common/cas.h"
+#include "cinn/common/ir_util.h"
+#include "cinn/ir/collect_ir_nodes.h"
+#include "cinn/ir/ir.h"
+#include "cinn/ir/ir_operators.h"
+#include "cinn/ir/ir_printer.h"
+#include "cinn/ir/ir_visitor.h"
+#include "cinn/lang/compute.h"
+#include "cinn/optim/ir_copy.h"
+#include "cinn/optim/ir_simplify.h"
+#include "cinn/optim/replace_var_with_expr.h"
+
+namespace cinn {
+namespace ir {
+
+Tensor GetTensor(const Expr& block) {
+  CHECK(block.As<ir::ScheduleBlockRealize>());
+  auto find_tensor = ir::CollectIRNodesWithoutTensor(
+      block, [&](const Expr* x) { return x->As<ir::Store>(); }, true);
+  CHECK_EQ(find_tensor.size(), 1U) << "One block should only have one Store node (except for the root block)!";
+  CHECK((*find_tensor.begin()).As<ir::Store>()->tensor.as_tensor());
+  Tensor tensor = (*find_tensor.begin()).As<ir::Store>()->tensor.as_tensor_ref();
+  return tensor;
+}
+
+Tensor GetReadTensor(const Expr& block, int index) {
+  CHECK(block.As<ir::ScheduleBlockRealize>());
+  auto find_tensor = ir::CollectIRNodesWithoutTensor(
+      block, [&](const Expr* x) { return x->As<ir::Store>(); }, true);
+  CHECK_EQ(find_tensor.size(), 1U) << "One block should only have one Store node (except for the root block)!";
+  std::vector<Tensor> res;
+  auto find_read_tensor = ir::CollectIRNodesWithoutTensor(block, [&](const Expr* x) {
+    if (x->As<ir::Load>()) res.push_back(x->As<ir::Load>()->tensor.as_tensor_ref());
+    return x->As<ir::Load>();
+  });
+  CHECK_EQ(find_read_tensor.size(), res.size());
+  CHECK(!find_read_tensor.empty()) << "Didn't find Load tensor in block!";
+  CHECK_LT(index, (int)find_read_tensor.size()) << "Index should be less than the number of read tensors!";
+  return res[index];
+}
+
+int GetLoopExtent(const Expr& loop) {
+  CHECK(loop.As<ir::For>());
+  CHECK(common::is_zero(loop.As<ir::For>()->min));
+  CHECK(loop.As<ir::For>()->extent.is_constant());
+  return (int)loop.As<ir::For>()->extent.get_constant();
+}
+
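+// A quick note on the contract above (a sketch, not extra checking logic):
+// GetLoopExtent only accepts a canonical loop such as
+//   for (i, 0, 32) { ... }
+// and returns 32; a non-zero `min` or a symbolic `extent` fails the CHECKs.
+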
+void SetCudaAxisInfo(Expr* lowered_func) {
+  if (!lowered_func->as_lowered_func()) {
+    LOG(ERROR) << "The input of SetCudaAxisInfo should be a lowered_func!";
+    return;
+  }
+
+  auto func_body = lowered_func->as_lowered_func_ref()->body;
+  CudaAxisInfo info;
+
+  auto block_nodes = ir::CollectIRNodes(func_body, [&](const Expr* x) {
+    if (x->As<ir::For>() && x->As<ir::For>()->bind_info().valid()) {
+      auto bind_info = x->As<ir::For>()->bind_info();
+      info.set_valid(true);
+      if (bind_info.for_type == ForType::GPUThread) {
+        CHECK(common::is_zero(x->As<ir::For>()->min));
+        CHECK(x->As<ir::For>()->extent.is_constant());
+        int range = x->As<ir::For>()->extent.get_constant();
+        range = range > info.block_dim(bind_info.offset) ? range : info.block_dim(bind_info.offset);
+        VLOG(3) << "Set block dim[" << bind_info.offset << "] with range " << range;
+        info.set_block_dim(bind_info.offset, range);
+      } else if (bind_info.for_type == ForType::GPUBlock) {
+        CHECK(common::is_zero(x->As<ir::For>()->min));
+        CHECK(x->As<ir::For>()->extent.is_constant());
+        int range = x->As<ir::For>()->extent.get_constant();
+        range = range > info.grid_dim(bind_info.offset) ? range : info.grid_dim(bind_info.offset);
+        info.set_grid_dim(bind_info.offset, range);
+        VLOG(3) << "Set grid dim[" << bind_info.offset << "] with range " << range;
+      } else {
+        LOG(FATAL) << "The for loop's bind info should be gpu block or thread!";
+      }
+    }
+    return (x->As<ir::For>() && x->As<ir::For>()->bind_info().valid());
+  });
+  lowered_func->as_lowered_func_ref()->cuda_axis_info = info;
+}
+
+bool Contains(const Expr& container, const Expr& expr) {
+  auto find_expr = ir::CollectIRNodesWithoutTensor(
+      container, [&](const Expr* x) { return (x->node_type() == expr.node_type() && *x == expr); }, true);
+  return (!find_expr.empty());
+}
+
+Expr GetNextForLoop(const Expr& for_loop) {
+  Expr result;
+  CHECK(for_loop.As<ir::For>()) << "The input of GetNextForLoop should be ir::For!";
+  Expr for_body = for_loop.As<ir::For>()->body;
+  ir::Block* for_body_block = for_body.As<ir::Block>();
+  CHECK(for_body_block) << "The for_loop's body should be Block!";
+
+  // Only support the case where the body block contains a single sub for loop
+  int next_idx = -1;
+  for (int i = 0; i < for_body_block->stmts.size(); ++i) {
+    Expr stmt = for_body_block->stmts[i];
+    if (stmt.As<ir::IfThenElse>() || stmt.As<ir::For>()) {
+      if (next_idx == -1) {
+        next_idx = i;
+      } else {
+        // More than one sub for loop, return undefined.
+        return result;
+      }
+    }
+  }
+  if (next_idx == -1) {
+    // No sub for loop found, return undefined.
+    return result;
+  }
+
+  Expr block_body = for_body_block->stmts[next_idx];
+  if (block_body.As<ir::IfThenElse>()) {
+    // TODO(zhhsplendid): is it right to only handle the true case?
+    // It may be wrong, but the code was written by a previous developer;
+    // we will check it later.
+    CHECK(block_body.As<ir::IfThenElse>()->true_case.As<ir::Block>());
+    Expr true_case = block_body.As<ir::IfThenElse>()->true_case;
+    if (true_case.As<ir::Block>()->stmts.size() != 1U || !true_case.As<ir::Block>()->stmts[0].As<ir::For>())
+      return result;
+    result = true_case.As<ir::Block>()->stmts[0];
+    return result;
+  } else if (block_body.As<ir::For>()) {
+    return block_body;
+  } else {
+    return result;
+  }
+}
+
+std::vector<Expr> GetIfThenElseInRange(const Expr& top, const Expr& bottom) {
+  std::vector<Expr> if_nodes;
+  CHECK(top.As<ir::For>());
+  CHECK(bottom.As<ir::For>());
+  for (auto loop_iter = top; loop_iter != bottom;) {
+    CHECK(loop_iter.As<ir::For>());
+    CHECK(loop_iter.As<ir::For>()->body.As<ir::Block>()) << "For node's body should be Block!";
+    auto block = loop_iter.As<ir::For>()->body.As<ir::Block>();
+    for (Expr tmp : block->stmts) {
+      if (tmp.As<ir::IfThenElse>()) {
+        if_nodes.push_back(tmp);
+        CHECK(tmp.As<ir::IfThenElse>()->true_case.As<ir::Block>());
+        Expr true_case = tmp.As<ir::IfThenElse>()->true_case;
+        CHECK(true_case.As<ir::Block>()->stmts.size() == 1U && true_case.As<ir::Block>()->stmts[0].As<ir::For>());
+        tmp = true_case.As<ir::Block>()->stmts[0];
+      }
+      if (tmp.As<ir::For>()) {
+        loop_iter = tmp;
+      }
+    }
+  }
+  return if_nodes;
+}
+
Please check."; + if (replaced.empty()) return; + std::map replacing_map; + for (int i = 0; i < replaced.size(); ++i) { + // If the Var to be replaced is equal to the candidate, we skip it. + if (candidates[i].is_var() && candidates[i].as_var_ref() == replaced[i]) continue; + replacing_map[replaced[i]] = candidates[i]; + } + MappingVarToExprMutator mapper(replacing_map); + mapper(source); + return; +} + +std::vector ValidateFactors(const std::vector& factors, int total_extent) { + CHECK(!factors.empty()) << "The factors param of Split should not be empty! Please check."; + bool has_minus_one = false; + int product = 1; + for (auto& i : factors) { + CHECK(i != 0) << "The params in factors of Split should not be 0! Please check."; + CHECK(i >= -1) << "The params in factors of Split should not be less than -1! Please check."; + if (i == -1) { + CHECK(!has_minus_one) << "The params in factors of Split should not have more than one -1! Please check."; + has_minus_one = true; + } else { + product *= i; + } + } + std::vector validated_factors = factors; + if (!has_minus_one) { + CHECK_GE(product, total_extent) + << "In Split, the factors' product should be equal to original loop's extent! Please check."; + return validated_factors; + } else { + CHECK_LE(product, total_extent) << "In Split, when there is -1 in factors, the other factors' product should be <= " + "original loop's extent! Please check."; + int minus_one_candidate = (int)ceil((double)total_extent / (double)product); + for (int i = 0; i < validated_factors.size(); ++i) { + if (validated_factors[i] == -1) { + validated_factors[i] = minus_one_candidate; + } + } + return validated_factors; + } +} + +void CHECKRfactorValidation(const Expr& rf_loop, int rf_axis) { + auto* rf_for = rf_loop.As(); + CHECK(rf_for) << "Expr param of Rfactor must be For node! 
Please check."; + // check the rf_loop only has one schedule block + auto block_nodes = ir::CollectIRNodesWithoutTensor( + rf_loop, [&](const Expr* x) { return x->As(); }, true); + CHECK_EQ(block_nodes.size(), 1U) << "Rfactor Loop should only have one schedule block"; + auto find_store = ir::CollectIRNodesWithoutTensor( + rf_loop, [&](const Expr* x) { return x->As(); }, true); + CHECK_EQ(find_store.size(), 1U); + auto indice = find_store.begin()->As()->indices; + // check rf_axis + CHECK_LE(rf_axis, indice.size()) << "rf_axis should not be greater than store's domain size"; + // check rfactor loop is reduce + auto* sch_block_realize = block_nodes.begin()->As(); + auto* sch_block = sch_block_realize->schedule_block.As(); + CHECK(sch_block); + auto& iter_values = sch_block_realize->iter_values; + auto& iter_vars = sch_block->iter_vars; + CHECK_EQ(iter_values.size(), iter_vars.size()); + auto rf_loop_var = rf_for->loop_var; + Var rf_block_var; + for (int i = 0; i < iter_values.size(); ++i) { + if (ContainVar({iter_values[i]}, rf_loop_var->name)) { + CHECK(!rf_block_var.defined()) << "rfactor loop var can only be binded to one block var"; + auto iter_value = iter_values[i].As<_Var_>(); + CHECK(iter_value) << "not support complex reduce bindings"; + rf_block_var = iter_vars[i]; + auto it = std::find_if(indice.begin(), indice.end(), [&](const Expr& x) { + return x.As<_Var_>() && x.As<_Var_>()->name == rf_block_var->name; + }); + CHECK(it == indice.end()) << "rfactor loop var is not reduce, please check!"; + } + } +} + +std::vector GetLoopsOfExpr(const Expr& expr, const Expr& root) { + auto loop_nodes = + ir::CollectIRNodesWithoutTensor(root, [&](const Expr* x) { return x->As() && Contains(*x, expr); }); + std::vector result(loop_nodes.begin(), loop_nodes.end()); + if (result.empty()) LOG(FATAL) << "Didn't find expr's : \n" << expr << "\n loops in root : \n" << root; + std::sort(result.begin(), result.end(), [&](Expr i, Expr j) { + return (utils::GetStreamCnt(i).size() > utils::GetStreamCnt(j).size()); + }); + return result; +} + +IterRange GetAccessedRange(const Expr& index, + const std::vector& iter_vars, + const std::vector& iter_ranges) { + CHECK_EQ(iter_vars.size(), iter_ranges.size()); + std::vector var_mins, var_maxs; + for (const auto& range : iter_ranges) { + var_mins.emplace_back(range.min); + var_maxs.emplace_back(range.min + range.extent - 1); + } + + Expr indice_min = optim::IRCopy(index); + Expr indice_max = optim::IRCopy(index); + // replace the var by the corresponding iter_value + ReplaceExpr(&indice_min, iter_vars, var_mins); + ReplaceExpr(&indice_max, iter_vars, var_maxs); + // simplify expression + indice_min = common::AutoSimplify(indice_min); + indice_max = common::AutoSimplify(indice_max); + + Expr indice_extent; + Expr mod_extent(0); + if (indice_min.As() && indice_min.As()->b().is_constant()) mod_extent = indice_min.As()->b(); + + if (indice_min == indice_max) { + if (common::is_zero(mod_extent)) { + // If a index keeps constant, its extent should be 1. 
+IterRange GetAccessedRange(const Expr& index,
+                           const std::vector<Var>& iter_vars,
+                           const std::vector<IterRange>& iter_ranges) {
+  CHECK_EQ(iter_vars.size(), iter_ranges.size());
+  std::vector<Expr> var_mins, var_maxs;
+  for (const auto& range : iter_ranges) {
+    var_mins.emplace_back(range.min);
+    var_maxs.emplace_back(range.min + range.extent - 1);
+  }
+
+  Expr indice_min = optim::IRCopy(index);
+  Expr indice_max = optim::IRCopy(index);
+  // replace the vars by their corresponding range bounds
+  ReplaceExpr(&indice_min, iter_vars, var_mins);
+  ReplaceExpr(&indice_max, iter_vars, var_maxs);
+  // simplify the expressions
+  indice_min = common::AutoSimplify(indice_min);
+  indice_max = common::AutoSimplify(indice_max);
+
+  Expr indice_extent;
+  Expr mod_extent(0);
+  if (indice_min.As<Mod>() && indice_min.As<Mod>()->b().is_constant()) mod_extent = indice_min.As<Mod>()->b();
+
+  if (indice_min == indice_max) {
+    if (common::is_zero(mod_extent)) {
+      // If an index keeps constant, its extent should be 1.
+      indice_extent = Expr(1);
+    } else {
+      indice_extent = mod_extent;
+    }
+  } else {
+    indice_extent = common::AutoSimplify(common::AutoSimplify(indice_max) - common::AutoSimplify(indice_min) + 1);
+  }
+
+  if (indice_extent.is_constant() && indice_extent.get_constant() < 0) {
+    VLOG(3) << "deduced indices are not constant";
+    indice_min = indice_max;
+    indice_extent = Expr(-indice_extent.get_constant());
+  }
+  VLOG(3) << "indice_min=" << indice_min << ", indice_max=" << indice_max << ", indice_extent=" << indice_extent;
+  return IterRange(indice_min, indice_extent);
+}
+
+std::vector<IterRange> CalculateTensorRegions(const Expr& block,
+                                              const std::vector<Expr>& tensor_indices,
+                                              const Tensor& tensor,
+                                              const Expr& root) {
+  CHECK(block.As<ir::ScheduleBlockRealize>());
+  auto iter_vars = block.As<ir::ScheduleBlockRealize>()->schedule_block.As<ir::ScheduleBlock>()->iter_vars;
+  auto iter_values = block.As<ir::ScheduleBlockRealize>()->iter_values;
+
+  std::vector<Var> loop_vars;
+  std::vector<IterRange> loop_ranges;
+
+  auto outer_loops = GetLoopsOfExpr(block, root);
+  for (auto& loop : outer_loops) {
+    CHECK(loop.As<ir::For>());
+    loop_vars.emplace_back(loop.As<ir::For>()->loop_var);
+    loop_ranges.emplace_back(IterRange(loop.As<ir::For>()->min, loop.As<ir::For>()->extent));
+  }
+
+  std::vector<IterRange> result;
+  for (int i = 0; i < tensor_indices.size(); ++i) {
+    Expr binded_index = optim::IRCopy(tensor_indices[i]);
+    ReplaceExpr(&binded_index, iter_vars, iter_values);
+    auto range = GetAccessedRange(binded_index, loop_vars, loop_ranges);
+
+    // In general the range should be constant, but in some cases AutoSimplify
+    // (our algebraic simplification function) cannot simplify it completely, so
+    // we conservatively use the whole shape of this index as the accessed range.
+    if (!range.min.is_constant() || !range.extent.is_constant()) {
+      VLOG(3) << "deduced range is not constant, range.min=" << range.min << ", range.extent=" << range.extent;
+      if (tensor->buffer.defined()) {
+        CHECK_GT((int)tensor->buffer->shape.size(), i);
+        result.emplace_back(IterRange(Expr(0), tensor->buffer->shape[i]));
+      } else {
+        CHECK_GT((int)tensor->shape.size(), i);
+        result.emplace_back(IterRange(Expr(0), tensor->shape[i]));
+      }
+    } else {
+      result.emplace_back(std::move(range));
+    }
+  }
+
+  return result;
+}
+
+Expr GetNthAccessExpr(const Expr& block, int index, bool is_write) {
+  CHECK(block.As<ir::ScheduleBlockRealize>());
+  auto compute_body = block.As<ir::ScheduleBlockRealize>()->schedule_block.As<ir::ScheduleBlock>()->body;
+  if (is_write) {
+    std::vector<Expr> find_store_vec;
+    auto find_store = ir::CollectIRNodesWithoutTensor(compute_body, [&](const Expr* x) {
+      if (x->As<ir::Store>()) find_store_vec.push_back(*x);
+      return x->As<ir::Store>();
+    });
+    CHECK_EQ(find_store.size(), find_store_vec.size());
+    CHECK_LT(index, (int)find_store.size());
+    Expr store_index = find_store_vec[index];
+    return store_index;
+  } else {
+    std::vector<Expr> find_load_vec;
+    auto find_load = ir::CollectIRNodesWithoutTensor(compute_body, [&](const Expr* x) {
+      if (x->As<ir::Load>()) find_load_vec.push_back(*x);
+      return x->As<ir::Load>();
+    });
+    CHECK_EQ(find_load.size(), find_load_vec.size());
+    CHECK_LT(index, (int)find_load.size());
+    Expr load_index = find_load_vec[index];
+    return load_index;
+  }
+}
+
+Tensor MakeCacheTensor(const Tensor& tensor, const std::string& memory_type) {
+  auto cache_tensor = lang::Compute(
+      tensor->shape,
+      [=](const std::vector<Expr>& dims) { return tensor(dims); },
+      tensor->name + "_" + memory_type + "_temp_buffer");
+  cache_tensor->WithBuffer(memory_type);
+  return cache_tensor;
+}
+
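+// For instance (a sketch): caching a tensor named "A" with memory_type
+// "shared" yields a tensor "A_shared_temp_buffer" bound to a shared-memory
+// buffer, which is the name the cache block created below will write to.
+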
+Expr MakeCacheBlock(const std::vector<IterRange>& buffer_ranges,
+                    CacheBlockInfo* info,
+                    const std::string& memory_type,
+                    DeviceAPI device_api) {
+  // loop variables
+  std::vector<Var> loop_vars;
+  // bindings in the block realize
+  std::vector<Expr> iter_values;
+  // Create loop vars and block vars' binding_value
+  for (const auto& range : buffer_ranges) {
+    Var loop_var(common::UniqName("cache_ax" + std::to_string(loop_vars.size())));
+    // Var loop_var("ax" + std::to_string(loop_vars.size()));
+    loop_vars.push_back(loop_var);
+    iter_values.push_back(common::AutoSimplify(range.min + loop_var));
+  }
+  // block variables
+  std::vector<Var> block_vars;
+  Tensor new_tensor = info->alloc;
+  // Create block vars, the block's accessed region and accessing indices
+  CHECK(new_tensor->buffer.defined());
+  for (auto& dim : new_tensor->buffer->shape) {
+    Var var(Expr(0), dim, "v" + std::to_string(block_vars.size()), false);
+    block_vars.push_back(var);
+  }
+  auto body = new_tensor->tensor_store_expanded_body();
+  std::vector<Var> axis_vars = common::GenDefaultAxis(new_tensor->domain.size());
+  axis_vars.insert(axis_vars.end(), new_tensor->reduce_axis.begin(), new_tensor->reduce_axis.end());
+  for (int i = 0; i < axis_vars.size(); ++i) {
+    optim::ReplaceVarWithExpr(&body, axis_vars[i], block_vars[i]);
+  }
+  Expr block = ir::ScheduleBlockRealize::Make(
+      iter_values, ir::ScheduleBlock::Make(block_vars, {}, {}, new_tensor->name, Block::Make({body})));
+  Expr new_body = block;
+  for (int i = (int)loop_vars.size() - 1; i >= 0; i--) {
+    new_body = For::Make(loop_vars[i],
+                         Expr(0),
+                         common::AutoSimplify(buffer_ranges[i].extent),
+                         ir::ForType::Serial,
+                         device_api,
+                         ir::Block::Make({new_body}));
+  }
+  info->cache_block = std::move(new_body);
+  return block;
+}
+
+void FindInsertionPoint(Expr& root, CacheBlockInfo* info, bool is_write) {
+  Expr find_tensor = is_write ? Expr(info->write_tensor) : Expr(info->read_tensor);
+  auto find_produce_read = ir::CollectIRNodesWithoutTensor(
+      root, [&](const Expr* x) { return x->As<ir::Store>() && x->As<ir::Store>()->tensor == find_tensor; });
+
+  if (find_produce_read.empty()) {
+    CHECK(root.As<ir::ScheduleBlockRealize>()->schedule_block.As<ir::ScheduleBlock>());
+    CHECK(root.As<ir::ScheduleBlockRealize>()->schedule_block.As<ir::ScheduleBlock>()->body.As<ir::Block>());
+    info->loc_block = root.As<ir::ScheduleBlockRealize>()->schedule_block.As<ir::ScheduleBlock>()->body;
+    info->loc_pos = 0;
+    return;
+  }
+
+  CHECK_EQ(find_produce_read.size(), 1U);
+  Expr producer = *(find_produce_read.begin());
+
+  CHECK(root.As<ir::ScheduleBlockRealize>()->schedule_block.As<ir::ScheduleBlock>());
+  CHECK(root.As<ir::ScheduleBlockRealize>()->schedule_block.As<ir::ScheduleBlock>()->body.As<ir::Block>());
+  info->loc_block = root.As<ir::ScheduleBlockRealize>()->schedule_block.As<ir::ScheduleBlock>()->body;
+  for (int i = 0; i < (int)info->loc_block.As<ir::Block>()->stmts.size(); ++i) {
+    if (Contains(info->loc_block.As<ir::Block>()->stmts[i], producer)) {
+      info->loc_pos = i + 1;
+      break;
+    }
+  }
+}
+
+const std::set<Expr, CompExpr> CollectLoopsToSet(const std::vector<Expr>& loops) {
+  std::set<Expr, CompExpr> for_loops;
+  for (auto& i : loops) {
+    CHECK(i.As<ir::For>()) << "loops should be For nodes! Please check.";
+    auto inserted = for_loops.insert(i);
+    if (!inserted.second) {
+      LOG(FATAL) << "There should be no duplicate elements in loops! Please check.";
+    }
+  }
+  return for_loops;
+}
+
+// This function is used in the Reorder schedule primitive. Since the input loop
+// Expr(s) of Reorder don't give the original for-loop order, we have to find
+// the top (outermost) loop and the bottom (innermost) one among the loop Expr(s)
+std::pair<Expr, Expr> GetBoundaryOfReorderRange(const std::set<Expr, CompExpr>& loop_set) {
+  Expr top = *loop_set.begin();
+  Expr bottom;
+  std::set<Expr, CompExpr> visited;
+  bool first_traversal = true;
+  for (Expr loop_i : loop_set) {
+    if (visited.count(loop_i)) {
+      continue;
+    }
+    Expr v_for = loop_i;
+    CHECK(v_for.As<ir::For>());
+    while (v_for.defined()) {
+      // If loop_i's sub loop was already visited, that sub loop must be the previously found top.
+ // Then loop_i should be the new top + if (visited.count(v_for)) { + if (v_for != top) { + LOG(FATAL) << "Loops in GetBoundaryOfReorderRange is not a chain! Please check."; + } + top = loop_i; + break; + } + + // This while loop always GetNextForLoop(sub loop), so the last + // visited v_for in the first traversal will be the bottom. + if (first_traversal && loop_set.count(v_for)) { + bottom = v_for; + } + visited.insert(v_for); + v_for = GetNextForLoop(v_for); + } + first_traversal = false; + } + CHECK(top.As()); + CHECK(bottom.defined()); + CHECK(bottom.As()); + return std::make_pair(top, bottom); +} + +std::vector GetLoopsInRange(const Expr& top, const Expr& bottom) { + std::vector chain; + CHECK(top.As()); + CHECK(bottom.As()); + for (auto loop_iter = top; loop_iter != bottom;) { + Expr tmp = GetNextForLoop(loop_iter); + if (!tmp.defined()) LOG(FATAL) << "Loops in GetLoopsInReorderRange is not a chain! Please check."; + chain.push_back(loop_iter); + loop_iter = tmp; + } + chain.push_back(bottom); + return chain; +} + +// Construct a loop chain such that: +// +// loops[i_1] { +// loops[i_2] { +// ... +// loops[i_n] { +// stmts; +// } +// } +// } +// +// where reordered_indices = {i_1, i_2, ... i_n } +// +// This is a helper function which constructs non-main chain for other body +// statements in Reorder. See comment and call place in ConstructNewLoopChain +Expr ConstructOtherStmtChain(const std::vector& stmts, + const std::vector& loops, + const std::vector reordered_indices) { + Expr new_loop; + for (int i = reordered_indices.size() - 1; i >= 0; --i) { + Expr temp = optim::IRCopy(loops[reordered_indices[i]]); + CHECK(temp.defined()); + CHECK(temp.As()); + if (new_loop.defined()) { + temp.As()->body = Block::Make({new_loop}); + } else { + temp.As()->body = Block::Make({stmts}); + } + new_loop = temp; + } + return new_loop; +} + +Expr ConstructNewLoopChain(const std::vector& chain, + const std::vector& ordered_loops, + const std::set& loop_set, + std::vector& if_nodes) { + std::vector> condition_vars; + // In each IfThenElse node, find the vars its condition depends on. + for (auto& if_expr : if_nodes) { + CHECK(if_expr.As()); + auto var_set = ir::CollectIRNodes(if_expr.As()->condition, [&](const Expr* x) { return x->as_var(); }); + std::set var_name_set; + for (auto& i : var_set) var_name_set.insert(i.as_var()->name); + condition_vars.push_back(var_name_set); + } + Expr new_loop; + int index = static_cast(ordered_loops.size()) - 1; + + std::vector reordered_loop_chain; + // Construct the main loop chain from bottom to top. + for (int i = static_cast(chain.size()) - 1; i >= 0; i--) { + auto& loop_in_chain = chain[i]; + CHECK(loop_in_chain.As()); + Expr temp; + if (loop_set.count(loop_in_chain)) { + CHECK_GE(index, 0); + temp = optim::IRCopy(ordered_loops[index]); + --index; + } else { + temp = optim::IRCopy(loop_in_chain); + } + CHECK(temp.defined()); + CHECK(temp.As()); + // Main chain, each loop's body only contains sub_loop or bottom loop's body + if (new_loop.defined()) { + temp.As()->body = Block::Make({new_loop}); + } else { + temp.As()->body = loop_in_chain.As()->body; + } + Expr original_temp = temp; + // Here we handle the IfThenElse nodes. 
+    for (int i = 0; i < static_cast<int>(if_nodes.size()); ++i) {
+      if (condition_vars[i].count(original_temp.As<ir::For>()->loop_var->name)) {
+        Expr temp_body = temp.As<ir::For>()->body;
+        if (temp_body.As<ir::Block>() && temp_body.As<ir::Block>()->stmts.size() == 1U)
+          temp_body = temp_body.As<ir::Block>()->stmts[0];
+        temp.As<ir::For>()->body = IfThenElse::Make(
+            if_nodes[i].As<ir::IfThenElse>()->condition, temp_body, if_nodes[i].As<ir::IfThenElse>()->false_case);
+        temp.As<ir::For>()->body = Block::Make({temp.As<ir::For>()->body});
+        if_nodes.erase(if_nodes.begin() + i);
+        condition_vars.erase(condition_vars.begin() + i);
+        i--;
+      }
+    }
+    new_loop = temp;
+    reordered_loop_chain.push_back(new_loop);
+  }
+  CHECK(new_loop.defined());
+
+  // reordered_loop_chain, which represents the main loop chain, is now ordered from top to bottom.
+  std::reverse(reordered_loop_chain.begin(), reordered_loop_chain.end());
+
+  // In the main loop chain, each loop's body only contains the sub loop or the
+  // bottom loop's body, but the origin loop chain may contain some other body
+  // stmts, which the main loop chain has lost.
+  // For example:
+  //
+  //   for (i, 0, 32) {                               for (j, 0, 64) {
+  //     other_body_stmts          Reorder j, i         for (i, 0, 32) {
+  //     for (j, 0, 64) {        ---------------->        bottom_loop_body
+  //       bottom_loop_body                             }
+  //     }                                            }
+  //   }
+  //
+  // We go through the origin loops, check the other body stmts, and add them
+  // as another chain above the main chain, such as:
+  //
+  //   for (i, 0, 32) {
+  //     other_body_stmts
+  //   }
+  //   for (j, 0, 64) {
+  //     for (i, 0, 32) {
+  //       bottom_loop_body
+  //     }
+  //   }
+
+  // Construct the complete loop chain from the origin loop top to bottom.
+  CHECK_EQ(chain.size(), reordered_loop_chain.size())
+      << "In ConstructNewLoopChain of Reorder, the origin loop chain size does not equal the reordered one";
+  std::unordered_set<std::string> origin_loop_var_names;
+  Expr ret = new_loop;
+
+  // Maintain an index at which to add a stmt (other body stmt chain)
+  //
+  //   stmt stmt MainChainLoop stmt stmt
+  //        index              index+1
+  //
+  // The index of this MainChainLoop points at the place before the next MainChainLoop.
+  // We can insert statements before MainChainLoop at the index, and insert
+  // statements after MainChainLoop at the index + 1
+  int add_other_chain_index = 0;
+
+  for (int i = 0; i < chain.size() - 1; ++i) {
+    // We only check i < chain.size() - 1
+    // because the bottom loop's body stmts have all been added already
+
+    const ir::For* loop_in_chain = chain[i].As<ir::For>();
+    ir::For* reordered_in_chain = reordered_loop_chain[i].As<ir::For>();
+
+    origin_loop_var_names.insert(loop_in_chain->loop_var->name);
+    CHECK_EQ(origin_loop_var_names.size(), i + 1) << "Duplicate loop var name in the origin chain during Reorder";
+
+    const ir::Block* body_block = loop_in_chain->body.As<ir::Block>();
+
+    if (body_block != nullptr && body_block->stmts.size() > 1) {
+      // contains other body stmts
+
+      // Get the other body statements before and after the loop
+      bool other_stmt_body_before_loop = true;
+      std::vector<Expr> stmts_before_loop;
+      std::vector<Expr> stmts_after_loop;
+      for (int j = 0; j < body_block->stmts.size(); ++j) {
+        if (body_block->stmts[j].As<ir::For>() &&
+            body_block->stmts[j].As<ir::For>()->loop_var->name == chain[i + 1].As<ir::For>()->loop_var->name) {
+          other_stmt_body_before_loop = false;
+          continue;
+        }
+        if (other_stmt_body_before_loop) {
+          stmts_before_loop.push_back(body_block->stmts[j]);
+        } else {
+          stmts_after_loop.push_back(body_block->stmts[j]);
+        }
+      }
+
+      // Find the chain that the other body stmts share with the main loop chain
+      std::vector<int> reordered_indices;
+      for (int j = 0; j < reordered_loop_chain.size(); ++j) {
+        if (origin_loop_var_names.count(reordered_loop_chain[j].As<ir::For>()->loop_var->name)) {
reordered_indices.push_back(j); + } + } + CHECK_EQ(reordered_indices.size(), origin_loop_var_names.size()) + << "Reordered chain loop var names doesn't match other stmt chain loop var names"; + + // Add other stmts chain to root Block if other stmts exist + if (!stmts_before_loop.empty()) { + Expr before_chain = ConstructOtherStmtChain(stmts_before_loop, reordered_loop_chain, reordered_indices); + if (ret.As() == nullptr) { + ret = ir::Block::Make({ret}); + } + std::vector& inplace_stmts = ret.As()->stmts; + auto pos = inplace_stmts.begin() + add_other_chain_index; + inplace_stmts.insert(pos, before_chain); + ++add_other_chain_index; + } + + if (!stmts_after_loop.empty()) { + Expr after_chain = ConstructOtherStmtChain(stmts_after_loop, reordered_loop_chain, reordered_indices); + if (ret.As() == nullptr) { + ret = ir::Block::Make({ret}); + } + std::vector& inplace_stmts = ret.As()->stmts; + auto pos = inplace_stmts.begin() + add_other_chain_index + 1; + inplace_stmts.insert(pos, after_chain); + } + } + } + + return ret; +} + +std::vector GetProducers(const Expr& block, const Expr& root) { + CHECK(block.As()); + CHECK(root.As()); + std::vector producers; + + // collect all producers' tensor names + std::set producer_tensor_names; + auto compute_body = block.As()->schedule_block.As()->body; + ir::CollectIRNodesWithoutTensor(compute_body, [&producer_tensor_names](const Expr* x) { + auto* load = x->As(); + if (load) { + producer_tensor_names.insert(load->tensor.as_tensor()->name); + return true; + } + return false; + }); + + // traverse each of other blocks and filter those ones which contain at least one producer tensor; + auto find_blocks = ir::CollectIRNodesWithoutTensor( + root, [&block, &root](const Expr* x) { return x->As() && *x != block && *x != root; }); + for (auto&& cur : find_blocks) { + auto* cur_block = cur.As()->schedule_block.As(); + CHECK(cur_block) << "block result should be a ScheduleBlockRealize"; + auto find_stores = ir::CollectIRNodesWithoutTensor(cur_block->body, [&producer_tensor_names](const Expr* x) { + return x->As() && producer_tensor_names.count(x->As()->tensor.as_tensor()->name) > 0; + }); + if (!find_stores.empty()) producers.emplace_back(cur); + } + return producers; +} + +std::vector GetConsumers(const Expr& block, const Expr& root) { + CHECK(block.As()); + CHECK(root.As()); + std::vector consumers; + std::string block_tensor = GetTensor(block)->name; + auto find_block = ir::CollectIRNodesWithoutTensor( + root, [&](const Expr* x) { return x->As() && *x != block && *x != root; }); + for (auto& i : find_block) { + CHECK(i.As()->schedule_block.As()); + auto block_body = i.As()->schedule_block.As()->body; + auto find_load = ir::CollectIRNodesWithoutTensor(block_body, [&](const Expr* x) { + return x->As() && x->As()->tensor.as_tensor_ref()->name == block_tensor; + }); + if (!find_load.empty()) consumers.emplace_back(i); + } + return consumers; +} + +void CheckComputeAtValidation(const Expr& block, const Expr& loop, const Expr& root) { + auto find_block = ir::CollectIRNodesWithoutTensor( + root, [&](const Expr* x) { return x->As() && *x == block; }, true); + CHECK(!find_block.empty()) << "Didn't find block in root!"; + + auto find_loop = ir::CollectIRNodesWithoutTensor( + root, [&](const Expr* x) { return x->As() && *x == loop; }, true); + CHECK(!find_loop.empty()) << "Didn't find loop in root!"; + + auto find_block_in_loop = ir::CollectIRNodesWithoutTensor( + loop, [&](const Expr* x) { return x->As() && *x == block; }, true); + CHECK(find_block_in_loop.empty()) << 
"loop should not be block's ancestor!"; +} + +void InsertBlock(Expr& for_loop, const Expr& insertion, int index) { + CHECK(for_loop.As()); + CHECK(for_loop.As()->body.As()); + ir::Block* dst_block = for_loop.As()->body.As(); + CHECK(index == -1 || index >= 0 && index < dst_block->stmts.size()) + << "index = " << index << ", it should be -1 or between [0, block stmts size)"; + + if (index == -1) { + dst_block->stmts.emplace_back(insertion); + } else { + auto dst_it = dst_block->stmts.begin() + index; + if (dst_it->As()) { + auto* inserted_block = dst_it->As()->true_case.As(); + CHECK(inserted_block) << "the IfThenElse node to be inserted shuold contain a true_case block"; + inserted_block->stmts.insert(inserted_block->stmts.begin(), insertion); + } else { + dst_block->stmts.insert(dst_it, insertion); + } + } +} + +IterRange RangeUnion(const IterRange& range1, const IterRange& range2) { + Expr new_min = common::AutoSimplify(Min::Make(range1.min, range2.min)); + Expr new_extent = common::AutoSimplify( + common::AutoSimplify(Max::Make(range1.min + range1.extent, range2.min + range2.extent)) - new_min); + return IterRange(new_min, new_extent); +} + +std::vector CalculateRequiredRegions(const Expr& block, + const Expr& loop, + const Expr& root, + const std::vector& required_blocks, + bool is_store_provided) { + CHECK(block.As()) << "Param block should be a ir::ScheduleBlockRealize node"; + CHECK(loop.As()) << "Param loop should be a ir::For node"; + + std::set provided_nodes; + if (is_store_provided) { + provided_nodes = ir::CollectIRNodesWithoutTensor(block, [&](const Expr* x) { return x->As(); }); + } else { + provided_nodes = ir::CollectIRNodesWithoutTensor(block, [&](const Expr* x) { return x->As(); }); + } + + std::vector required_buffer_range; + // deduce accessed regions of the provided tensor in block by itering each required block + for (const Expr& pro_node : provided_nodes) { + const std::string& provided_tensor_name = is_store_provided ? pro_node.As()->tensor.as_tensor()->name + : pro_node.As()->tensor.as_tensor()->name; + + for (const Expr& req_block : required_blocks) { + CHECK(req_block.As()); + Expr block_body = + optim::IRCopy(req_block.As()->schedule_block.As()->body); + auto iter_vars = req_block.As()->schedule_block.As()->iter_vars; + auto iter_values = req_block.As()->iter_values; + ReplaceExpr(&block_body, iter_vars, iter_values); + + // Notice that we look for For nodes in loop's body instead of loop itself. + auto find_loops = ir::CollectIRNodesWithoutTensor( + loop.As()->body, [&](const Expr* x) { return x->As() && Contains(*x, req_block); }); + + // collect vars and their ranges of each loop under the input loop + std::vector loop_vars; + std::vector loop_ranges; + for (const auto& for_loop : find_loops) { + loop_vars.emplace_back(for_loop.As()->loop_var); + loop_ranges.emplace_back(for_loop.As()->min, for_loop.As()->extent); + } + + std::set required_nodes; + if (is_store_provided) { + required_nodes = ir::CollectIRNodesWithoutTensor(block_body, [&](const Expr* x) { + return x->As() && x->As()->tensor.as_tensor_ref()->name == provided_tensor_name; + }); + } else { + required_nodes = ir::CollectIRNodesWithoutTensor(block_body, [&](const Expr* x) { + return x->As() && x->As()->tensor.as_tensor_ref()->name == provided_tensor_name; + }); + } + + // deducing range by indices of each required node + for (const Expr& req_node : required_nodes) { + const auto& indices = is_store_provided ? 
req_node.As()->indices : req_node.As()->indices; + + if (find_loops.empty()) { + for (int i = 0; i < indices.size(); ++i) { + if (i >= required_buffer_range.size()) + required_buffer_range.emplace_back(indices[i], Expr(1)); + else + required_buffer_range[i] = RangeUnion(required_buffer_range[i], IterRange(indices[i], Expr(1))); + } + } else { + for (int i = 0; i < indices.size(); ++i) { + auto range = GetAccessedRange(indices[i], loop_vars, loop_ranges); + if (i >= required_buffer_range.size()) { + required_buffer_range.emplace_back(std::move(range)); + } else { + required_buffer_range[i] = RangeUnion(required_buffer_range[i], range); + } + } + } + } // end for load_nodes + } + } + + int iter_size = block.As()->iter_values.size(); + // maybe some dimensions are not accessed by consumers so we should append them + if (iter_size > required_buffer_range.size()) { + for (int i = required_buffer_range.size(); i < iter_size; ++i) { + CHECK(block.As()->iter_values[i].as_var() || + block.As()->iter_values[i].is_constant()); + if (block.As()->iter_values[i].as_var()) { + auto find_for_loops = ir::CollectIRNodesWithoutTensor(root, [&](const Expr* x) { + return x->As() && x->As()->loop_var->name == + block.As()->iter_values[i].as_var_ref()->name; + }); + CHECK_EQ(find_for_loops.size(), 1U); + required_buffer_range.emplace_back((*find_for_loops.begin()).As()->min, + (*find_for_loops.begin()).As()->extent); + } else { + int cons = (int)block.As()->iter_values[i].is_constant(); + required_buffer_range.emplace_back(Expr(cons), Expr(1)); + } + } + } + return required_buffer_range; +} + +Expr CheckComputeInlineValidationAndGetStore(const Expr& schedule_block, const Expr& root) { + CHECK(schedule_block.As()); + auto compute_body = schedule_block.As()->schedule_block.As()->body; + // 1. Check the schedule block to be inlined is not a reduce tensor. + auto find_store = ir::CollectIRNodesWithoutTensor( + compute_body, [&](const Expr* x) { return x->As(); }, true); + CHECK_EQ(find_store.size(), 1U); + Expr tensor = (*find_store.begin()).As()->tensor; + CHECK(!tensor.as_tensor_ref()->is_reduce_tensor()); + // 2. Check this schedule block is the only writer of the tensor. + find_store = ir::CollectIRNodesWithoutTensor( + root, + [&](const Expr* x) { + return x->As() && (x->As()->tensor).as_tensor_ref()->name == tensor.as_tensor_ref()->name; + }, + true); + CHECK_EQ(find_store.size(), 1U); + // 3. Check there is no overlap between the buffers the schedule block reads and writes. + auto find_load = ir::CollectIRNodesWithoutTensor( + compute_body, [&](const Expr* x) { return x->As() && x->As()->tensor == tensor; }); + CHECK(find_load.empty()); + return (*find_store.begin()); +} + +std::tuple CheckReverseComputeInlineValidationAndGetExprs(const Expr& schedule_block, + const Expr& root) { + CHECK(schedule_block.As()); + auto compute_body = schedule_block.As()->schedule_block.As()->body; + // 1. Check the schedule block to be reverse inlined is not a reduce tensor. + auto find_inlined_load = ir::CollectIRNodesWithoutTensor( + compute_body, [&](const Expr* x) { return x->As(); }, true); + CHECK_EQ(find_inlined_load.size(), 1U); + Expr tensor = (*find_inlined_load.begin()).As()->tensor; + CHECK(!tensor.as_tensor_ref()->is_reduce_tensor()); + auto inlined_load = *find_inlined_load.begin(); + // 2. Check this schedule block is the only reader of the tensor. 
+ auto find_load = ir::CollectIRNodesWithoutTensor( + root, + [&](const Expr* x) { + return x->As() && (x->As()->tensor).as_tensor_ref()->name == tensor.as_tensor_ref()->name; + }, + true); + CHECK_EQ(find_load.size(), 1U); + // 3. Check there is no overlap between the buffers the schedule block reads and writes. + auto find_store = ir::CollectIRNodesWithoutTensor( + compute_body, [&](const Expr* x) { return x->As() && x->As()->tensor == tensor; }); + CHECK(find_store.empty()); + // 4. Get store that will be inlined. + auto find_inlined_store = ir::CollectIRNodesWithoutTensor( + root, [&](const Expr* x) { return x->As() && x->As()->tensor == tensor; }); + CHECK_EQ(find_inlined_store.size(), 1U); + auto inlined_store = *find_inlined_store.begin(); + // 5. Get target store. + auto find_target_store = ir::CollectIRNodesWithoutTensor( + compute_body, [&](const Expr* x) { return x->As(); }, true); + CHECK_EQ(find_target_store.size(), 1U); + auto target_store = *find_target_store.begin(); + return {inlined_load, inlined_store, target_store}; +} + +bool ContainVar(const std::vector& exprs, const std::string& var_name) { + for (auto& expr : exprs) { + auto find_expr = ir::CollectIRNodesWithoutTensor( + expr, [&](const Expr* x) { return x->As<_Var_>() && x->As<_Var_>()->name == var_name; }, true); + if (!find_expr.empty()) return true; + } + return false; +} + +std::unordered_map PrimeFactorize(int n) { + std::unordered_map factors; + while (n % 2 == 0) { + ++factors[2]; + n /= 2; + } + for (int i = 3; i <= sqrt(n); i += 2) { + while (n % i == 0) { + ++factors[i]; + n /= i; + } + } + if (n > 2) { + factors[n] = 1; + } + return factors; +} + +std::vector SampleTile(utils::LinearRandomEngine::StateType* rand_seed, int n, int extent) { + std::vector tile; + while (n > 1) { + std::unordered_map factors = PrimeFactorize(extent); + int product = 1; + for (auto& factor : factors) { + if (factor.second >= 1) { + int num = utils::SampleUniformInt(1, factor.second + 1, rand_seed); + product *= std::pow(factor.first, num); + } + } + tile.push_back(product); + extent /= product; + --n; + } + tile.push_back(extent); + return tile; +} +} // namespace ir +} // namespace cinn diff --git a/paddle/cinn/ir/ir_schedule_util.h b/paddle/cinn/ir/ir_schedule_util.h new file mode 100644 index 0000000000000..12a80f637969c --- /dev/null +++ b/paddle/cinn/ir/ir_schedule_util.h @@ -0,0 +1,448 @@ +// Copyright (c) 2022 CINN Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once +#include +#include +#include +#include +#include +#include + +#include "cinn/ir/ir.h" +#include "cinn/ir/ir_base.h" +#include "cinn/ir/ir_mutator.h" +#include "cinn/ir/tensor.h" +#include "cinn/utils/random_engine.h" +#include "cinn/utils/string.h" + +namespace cinn { +namespace ir { + +// Self-defined operator to support std::set +struct CompExpr { + bool operator()(const Expr& left, const Expr& right) const { + return utils::GetStreamCnt(left) < utils::GetStreamCnt(right); + } +}; + +// Self-defined operator to support std::set +struct CompVar { + bool operator()(const Var& left, const Var& right) const { return left->name < right->name; } +}; + +struct MappingVarToExprMutator : public ir::IRMutator<> { + MappingVarToExprMutator(const std::map& replacing_map) : replacing_map_(replacing_map) {} + + void operator()(Expr* expr) { IRMutator::Visit(expr, expr); } + + private: + void Visit(const ir::_Var_* expr, Expr* op) override { + if (replacing_map_.count(op->as_var_ref())) { + *op = replacing_map_.at(op->as_var_ref()); + } + } + + private: + const std::map& replacing_map_; +}; + +struct FindLoopsVisitor { + FindLoopsVisitor(const Expr& block) : block_(block) {} + + std::vector operator()(const Expr* expr) { + CHECK(block_.As()); + visit_end = false; + Visit(expr); + return result; + } + + private: + void Visit(const Expr* expr) { + if (visit_end || !expr->defined()) return; + if (expr->As()) { + father_loops.emplace_back(*expr); + Visit(&(expr->As()->body)); + father_loops.pop_back(); + } else if (expr->As()) { + if (!expr->As()->iter_values.empty() && (*expr == block_)) { + result = father_loops; + visit_end = true; + return; + } else { + Visit(&(expr->As()->schedule_block)); + } + } else if (expr->As()) { + Visit(&(expr->As()->body)); + } else if (expr->As()) { + for (auto& n : expr->As()->stmts) Visit(&n); + } else if (expr->As()) { + Visit(&(expr->As()->true_case)); + Visit(&(expr->As()->false_case)); + } + } + + std::vector father_loops{}; + std::vector result{}; + bool visit_end{false}; + const Expr& block_; +}; + +/** + * \brief Given a ScheduleBlockRealize node, return the Store tensor in its body. + * @param block The given ScheduleBlockRealize node + * @return The Store tensor in block + */ +Tensor GetTensor(const Expr& block); + +struct FindBlocksVisitor { + FindBlocksVisitor(const std::string& block_name = "") : block_name_(block_name) {} + + std::vector operator()(const Expr* expr) { + Visit(expr); + return result; + } + + private: + void Visit(const Expr* expr) { + if (!expr->defined()) return; + if (!block_name_.empty() && !result.empty()) return; + if (expr->As()) { + Visit(&(expr->As()->body)); + } else if (expr->As()) { + if (!expr->As()->iter_values.empty()) { + auto* schedule_block = expr->As()->schedule_block.As(); + if (block_name_.empty() || schedule_block->name == block_name_) { + result.emplace_back(*expr); + } + } else { + Visit(&(expr->As()->schedule_block)); + } + } else if (expr->As()) { + Visit(&(expr->As()->body)); + } else if (expr->As()) { + for (auto& n : expr->As()->stmts) Visit(&n); + } else if (expr->As()) { + Visit(&(expr->As()->true_case)); + Visit(&(expr->As()->false_case)); + } + } + std::string block_name_; + std::vector result{}; +}; + +struct CacheBlockInfo { + /*! \brief The tensor to be read. */ + Tensor read_tensor; + /*! \brief The tensor to be written. */ + Tensor write_tensor; + /*! \brief The tensor allocation to be inserted into the block signature. */ + Tensor alloc; + /*! 
\brief The AST node whose body is where the cache stage should be inserted. */
+  Expr loc_block;
+  /*! \brief The index at which to insert the cache_read/cache_write stage. */
+  int loc_pos;
+  /*! \brief The cache_read/cache_write stage to be inserted. */
+  Expr cache_block;
+};
+
+// A struct representing the min value and the extent of an iterable range,
+// which is a semi-closed interval, i.e. [min, min + extent)
+struct IterRange {
+  IterRange(Expr begin, Expr length) : min(begin), extent(length) {}
+
+  Expr min;
+  Expr extent;
+};
+
+/**
+ * \brief Given a ScheduleBlockRealize node, return the index-th Load tensor in its body.
+ * @param block The given ScheduleBlockRealize node
+ * @param index The index of the Load tensor
+ * @return The index-th Load tensor in block
+ */
+Tensor GetReadTensor(const Expr& block, int index);
+
+/**
+ * \brief Given a For node, return its extent as int.
+ * @param loop The given For node
+ * @return The extent of the For node
+ */
+int GetLoopExtent(const Expr& loop);
+
+/**
+ * \brief Given a vector of Exprs, return whether they contain a Var with a specific name.
+ * @param exprs The given vector of Exprs
+ * @param var_name The name of the specific Var
+ * @return Whether there is a Var with the same name as var_name
+ */
+bool ContainVar(const std::vector<Expr>& exprs, const std::string& var_name);
+
+/**
+ * \brief Given a _LoweredFunc_, set its cuda_axis_info based on its func_body.
+ * @param lowered_func A pointer to the given _LoweredFunc_
+ */
+void SetCudaAxisInfo(Expr* lowered_func);
+
+/*!
+ * \brief Check if an Expr node contains a ScheduleBlockRealize node.
+ * \param container The container Expr node.
+ * \param expr The node we want to find.
+ * \return Whether the container contains the expr.
+ */
+bool Contains(const Expr& container, const Expr& expr);
+
+/**
+ * \brief Given a For loop, return the next For loop in its body.
+ * @param for_loop The given For loop.
+ * @return The next For loop.
+ */
+Expr GetNextForLoop(const Expr& for_loop);
+
+/**
+ * \brief Given two For loops, return all ir::IfThenElse nodes between them.
+ * @param top The given top For loop.
+ * @param bottom The given bottom For loop.
+ * @return All ir::IfThenElse nodes between them.
+ */
+std::vector<Expr> GetIfThenElseInRange(const Expr& top, const Expr& bottom);
+
+/**
+ * Replace the Vars in replaced with the Exprs in candidates in source. Vars -> Exprs is a one-to-one correspondence.
+ * @param source The Expr we will implement the change in.
+ * @param replaced The Vars to be replaced.
+ * @param candidates The Exprs that replace the Vars in replaced.
+ */
+void ReplaceExpr(Expr* source, const std::vector<Var>& replaced, const std::vector<Expr>& candidates);
+
+/**
+ * Validate the factors param of Split. We check whether the factors are valid and change -1 to a positive integer.
+ * @param factors The original factors.
+ * @param total_extent The extent of the loop to be split.
+ * @return The validated factors.
+ */
+std::vector<int> ValidateFactors(const std::vector<int>& factors, int total_extent);
+
+void CHECKRfactorValidation(const Expr& rf_loop, int rf_axis);
+
+/**
+ * Return the loops that contain the expr.
+ * @param expr The expr.
+ * @param root The root of the whole AST.
+ * @return The loops in the AST that contain the expr.
+ */
+std::vector<Expr> GetLoopsOfExpr(const Expr& expr, const Expr& root);
+
+/**
+ * Given an index Expr and all vars' ranges, return the accessed range of this index.
+ * @param index The Expr of a specified index.
+ * @param iter_vars The vars in the index expr.
+ * @param iter_range Each var's range. + * @return return an IterRange represents the accessed range of this indice, If it is not constant, return corresponding + * tensor's shape. + */ +IterRange GetAccessedRange(const Expr& index, + const std::vector& iter_vars, + const std::vector& iter_ranges); + +/** + * Given a ScheduleBlockRealize, an AST root, a tensor and its tensor_indices, return the accessed buffer region of the + * tensor in block. + * @param block The ScheduleBlockRealize. + * @param tensor_indices The tensor's indices. + * @param tensor The tensor. + * @param root The root of whole AST. + * @return return The accessed buffer region of the tensor in block. + */ + +std::vector CalculateTensorRegions(const Expr& block, + const std::vector& tensor_indices, + const Tensor& tensor, + const Expr& root); + +/** + * Return n-th access tensor in block + * @param block The ScheduleBlockRealize. + * @param index The index indicating which tensor we want to get. + * @param is_write We want to get write tensor or read tensor. + * @return return The n-th access tensor in block. Should be ir::Store(is_write) or ir::Load(!is_write). + */ +Expr GetNthAccessExpr(const Expr& block, int index, bool is_write); + +/** + * Make a tensor's cache tensor. + * @param tensor The original tensor. + * @param memory_type The memory type of the cache tensor. + * @return return The tensor's cache tensor. + */ +Tensor MakeCacheTensor(const Tensor& tensor, const std::string& memory_type); + +/** + * Make a the cache tensor's block. + * @param buffer_region The accessed region of cache tensor. + * @param info The information of cache block. + * @param memory_type The memory type of cache tensor. + * @param device_api The device api of this Expr. + * @return return ScheduleBlockRealize of the cache tensor. + */ +Expr MakeCacheBlock(const std::vector& buffer_ranges, + CacheBlockInfo* info, + const std::string& memory_type, + DeviceAPI device_api); + +/** + * Fidn cache tensor block's insertion point in the whole AST(root). + * @param root The whole AST. + * @param info The information of cache block. + * @param is_write Are we inserting a write cache tensor or a read cache tensor. + */ +void FindInsertionPoint(Expr& root, CacheBlockInfo* info, bool is_write); + +/** + * \brief Given a vector of For loops, return a set of them. + * @param loops The given vector of For loops. + * @return A set containing all the For loops in loops. + */ +const std::set CollectLoopsToSet(const std::vector& loops); + +/** + * \brief Given a set of For loops, return the boundary among them. + * @param loop_set The given set of For loops. + * @return A pair of the boundary among For loops.(The top For and bottom For) + */ +std::pair GetBoundaryOfReorderRange(const std::set& loop_set); + +/** + * \brief Given two For loops, return all loops between them. + * @param top The top For loop. + * @param bottom The bottom For loop. + * @return A vector containing all For loops between the boundary, stored in ascending order. + */ +std::vector GetLoopsInRange(const Expr& top, const Expr& bottom); + +/** + * \brief Given params, construct a new loop. + */ +Expr ConstructNewLoopChain(const std::vector& chain, + const std::vector& ordered_loops, + const std::set& loop_set, + std::vector& if_nodes); + +/*! + * \brief Find producers of block in root. + * \param block The ScheduleBlockRealize node we want to find its producers. + * \param root The root ScheduleBlockRealize node. + * \return block's producers(ScheduleBlockRealize nodes) in root. 
+ */ +std::vector GetProducers(const Expr& block, const Expr& root); + +/*! + * \brief Find consumers of block in root. + * \param block The ScheduleBlockRealize node we want to find its consumers. + * \param root The root ScheduleBlockRealize node. + * \return block's consumers(ScheduleBlockRealize nodes) in root. + */ +std::vector GetConsumers(const Expr& block, const Expr& root); + +/*! + * \brief Check if the params of ComputeAt is validate. + * \param block The block node we want to move in ComputeAt. + * \param loop The for node we want to put the block under in ComputeAt. + * \param root The root ScheduleBlockRealize node of block and loop. + */ +void CheckComputeAtValidation(const Expr& block, const Expr& loop, const Expr& root); + +/*! + * \brief Insert a new ScheduleBlockRealize in a loop's body(under its IfThenElse Node, if any) + * \param for_loop The for loop whose body we want to modify + * \param insertion The ScheduleBlockRealize we want to insert + * \param index The position index of the for_loop body `stmts` to be inserted: + * - `index = -1` means inserted into the tail + * - otherwise, it should be a index between [0, stmts size) + */ +void InsertBlock(Expr& for_loop, const Expr& insertion, int index = 0); + +/*! + * \brief Make a union of two range. The detailed function is : + * new_range.min = min(range1.min, range2.min) + * new_range.extent = max(range1.min + range1.extent, range2.min + range2.extent) - new_range.min + * Notice that the pair indicates a range's min and extent. + * \param range1 The first range + * \param range2 The second range + * \return The union of these two ranges + */ +IterRange RangeUnion(const IterRange& range1, const IterRange& range2); + +/*! + * \brief Calculate the required buffer region given a block and its required blocks. + * For example, if block is : + * B[i0, j0] = A[i0, j0] + * loop is : + * for (i, 0, 64) { + * for (j, 0, 64) { + * C[i, j] = B[i, j] + * } + * } + * And required_blocks is : + * C[i, j] = B[i, j] + * Then we get the required B's region: + * B[i, j], where: + * i : [i, i] + * j : [0, 64] + * \param block The ScheduleBlockRealize node begin required + * \param loop The loop where we will insert the block under it + * @param root The root of the whole AST. + * \param required_blocks vector of ScheduleBlockRealize nodes that require the block + * \param is_store_provided Whether Store nodes of the block provide the tensor, + * true means it is in compute_at case, otherwise false means in reverse_compuate_at case + * \return Each index's range of block's tensor. Indicating the buffer region being required. + */ +std::vector CalculateRequiredRegions(const Expr& block, + const Expr& loop, + const Expr& root, + const std::vector& required_blocks, + bool is_store_provided = true); + +Expr CheckComputeInlineValidationAndGetStore(const Expr& schedule_block, const Expr& root); + +/*! + * \brief Check if the reverse compute inline validation passes for a given schedule block and root expression, + * and retrieve the store expression if so. + * Reverse compute inline validation ensures that the outputs of a loop nest are properly computed in reverse order. + * \param schedule_block The schedule block to check. + * \param root The root expression of the loop nest. + * \return A tuple containing the load that will be inlined, the store that will be inlined and the target store. + */ +std::tuple CheckReverseComputeInlineValidationAndGetExprs(const Expr& schedule_block, + const Expr& root); + +/*! 
+ * \brief Get the prime factors of a number. + * For example, 12 = 2^2 * 3^1, then the return value is {2: 2, 3: 1}. + * \param n The number to be factorized. + * \return A map of prime factors and their corresponding exponents. + */ +std::unordered_map PrimeFactorize(int n); + +/*! + * \brief Given a number returns the form of the product of its n factors + * For example: + * n = 2, dividend = 12, return one of {2, 6}, {6, 2}, {3, 4}, {4, 3} + * \param seed The random number generator to use. + * \param n The number to be factorized. + * \param dividend The dividend of the number. + */ +std::vector SampleTile(utils::LinearRandomEngine::StateType* rand_seed, int n, int dividend); +} // namespace ir +} // namespace cinn diff --git a/paddle/cinn/ir/ir_test.cc b/paddle/cinn/ir/ir_test.cc new file mode 100644 index 0000000000000..39ec6b0073f58 --- /dev/null +++ b/paddle/cinn/ir/ir_test.cc @@ -0,0 +1,31 @@ +// Copyright (c) 2021 CINN Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "cinn/ir/ir.h" + +#include + +#include "cinn/utils/string.h" + +namespace cinn { +namespace ir { + +TEST(Expr, basic) { + Expr a(1); + auto b = Expr(a); + LOG(INFO) << b.as_int32(); +} + +} // namespace ir +} // namespace cinn diff --git a/paddle/cinn/ir/ir_verify.cc b/paddle/cinn/ir/ir_verify.cc new file mode 100644 index 0000000000000..b9f3fc7226e14 --- /dev/null +++ b/paddle/cinn/ir/ir_verify.cc @@ -0,0 +1,39 @@ +// Copyright (c) 2021 CINN Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "cinn/ir/ir_verify.h" + +#include "cinn/ir/ir_mutator.h" +#include "cinn/ir/ir_printer.h" + +namespace cinn::ir { + +struct IrVerifyVisitor : public ir::IRMutator<> { + using ir::IRMutator<>::Visit; + +#define __(op__) \ + void Visit(const op__ *op, Expr *expr) override { \ + op->Verify(); \ + IRMutator::Visit(op, expr); \ + } + NODETY_FORALL(__) +#undef __ +}; + +void IrVerify(Expr e) { + IrVerifyVisitor visitor; + visitor.Visit(&e, &e); +} + +} // namespace cinn::ir diff --git a/paddle/cinn/ir/ir_verify.h b/paddle/cinn/ir/ir_verify.h new file mode 100644 index 0000000000000..fa2fe259ef127 --- /dev/null +++ b/paddle/cinn/ir/ir_verify.h @@ -0,0 +1,22 @@ +// Copyright (c) 2021 CINN Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include "cinn/ir/ir.h" + +namespace cinn::ir { + +void IrVerify(Expr e); + +} // namespace cinn::ir diff --git a/paddle/cinn/ir/ir_verify_test.cc b/paddle/cinn/ir/ir_verify_test.cc new file mode 100644 index 0000000000000..5fcfe4cc8dcef --- /dev/null +++ b/paddle/cinn/ir/ir_verify_test.cc @@ -0,0 +1,29 @@ +// Copyright (c) 2021 CINN Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "cinn/ir/ir_verify.h" + +#include + +#include "cinn/ir/ir_operators.h" + +namespace cinn::ir { + +TEST(IrVerify, basic) { + Expr a(1); + Expr b(1); + IrVerify(a + b); +} + +} // namespace cinn::ir diff --git a/paddle/cinn/ir/ir_visitor.cc b/paddle/cinn/ir/ir_visitor.cc new file mode 100644 index 0000000000000..0cdbc828a91a2 --- /dev/null +++ b/paddle/cinn/ir/ir_visitor.cc @@ -0,0 +1,35 @@ +// Copyright (c) 2021 CINN Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "cinn/ir/ir_visitor.h" + +#include + +#include "cinn/ir/ir_printer.h" +#include "cinn/ir/tensor.h" +#include "cinn/utils/string.h" + +namespace cinn { +namespace ir { + +bool operator==(Expr a, Expr b) { + if (a.get() == b.get()) return true; + // TODO(Superjomn) implement with a more accurate one + return utils::GetStreamCnt(a) == utils::GetStreamCnt(b); +} + +bool operator!=(Expr a, Expr b) { return !(a == b); } + +} // namespace ir +} // namespace cinn diff --git a/paddle/cinn/ir/ir_visitor.h b/paddle/cinn/ir/ir_visitor.h new file mode 100644 index 0000000000000..21d7bab369ae8 --- /dev/null +++ b/paddle/cinn/ir/ir_visitor.h @@ -0,0 +1,82 @@ +// Copyright (c) 2021 CINN Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include + +#include "cinn/ir/buffer.h" +#include "cinn/ir/collect_ir_nodes.h" +#include "cinn/ir/intrinsic_ops.h" +#include "cinn/ir/ir.h" +#include "cinn/ir/lowered_func.h" +#include "cinn/ir/tensor.h" + +namespace cinn { +namespace ir { + +struct _Tensor_; + +/** + * Base class of all the methods visit the IR tree. + * @param RetTy return type. + * @param Args type of the extra arguments passed to the all the methods. + */ +template +struct IRVisitorBase { + //! Visit a expression. + // @{ + virtual RetTy Visit(const ir::Expr* expr, Args... args) { + CHECK(expr->defined()); + switch (expr->node_type()) { +#define __(op__) \ + case ir::IrNodeTy::op__: \ + return Visit(expr->As(), args...); + + NODETY_FORALL(__) + + default: + LOG(FATAL) << "not supported NodeTy"; +#undef __ + } + return RetTy(); + } + // @} + + protected: +#define __(op__) virtual RetTy Visit(const ir::op__* op, Args... args) = 0; + NODETY_FORALL(__) +#undef __ +}; + +/** + * Base of all the Ir readonly visitor. + */ +struct IRVisitor : public IRVisitorBase { + IRVisitor() = default; + + void Visit(const Expr* x) { IRVisitorBase::Visit(x); } +#define __m(t__) \ + virtual void Visit(const t__* x) {} + NODETY_FORALL(__m) +#undef __m +}; + +// std::set CollectIRNodes(Expr expr, std::function teller); + +bool operator==(Expr a, Expr b); +bool operator!=(Expr a, Expr b); + +} // namespace ir +} // namespace cinn diff --git a/paddle/cinn/ir/layout.cc b/paddle/cinn/ir/layout.cc new file mode 100644 index 0000000000000..9b97c0e5ecab2 --- /dev/null +++ b/paddle/cinn/ir/layout.cc @@ -0,0 +1,67 @@ +// Copyright (c) 2021 CINN Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
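+
+// Editor's note: an illustrative usage sketch, not part of the original patch.
+// The string constructor below treats an upper-case letter as a primal axis and
+// a digit-prefixed lower-case letter as a sub-axis split by that factor, e.g.:
+//
+//   ir::Layout layout("NCHW16c");
+//   // layout.axes()       -> {N, C, H, W, c}, with 'c' carrying factor 16
+//   // layout.axis_names() -> "NCHWc" (filled in by Verify())
+//   // Verify() also checks each sub-axis ('c') has a primal axis ('C').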
+ +#include "cinn/ir/layout.h" + +namespace cinn { +namespace ir { + +void Layout::Verify() { + { + CHECK(!name_.empty()); + CHECK(!axes_.empty()); + axis_names_ = ""; + for (auto& axis : axes_) { + CHECK_EQ(axis->name.size(), 1U); + auto axis_name = axis->name[0]; + CHECK((axis_name >= 'A' && axis_name <= 'Z') || (axis_name >= 'a' && axis_name <= 'z')); + CHECK(axis_names_.find(axis_name) == axis_names_.npos) << axis_name << " has already exsit."; + axis_names_ += axis_name; + } + int offset = 'A' - 'a'; + for (auto& axis : axes_) { + CHECK_EQ(axis->name.size(), 1U); + auto axis_name = axis->name[0]; + if (axis_name >= 'a' && axis_name <= 'z') { + CHECK(axis_names_.find(axis_name + offset) != axis_names_.npos) + << "sub-axis " << axis_name << " finds no primal axis"; + } + } + } +} +Layout::Layout(const std::string& name) { + CHECK(!name.empty()); + int factor = 0; + std::vector axes; + for (char c : name) { + if (c >= 'A' && c <= 'Z') { + CHECK_EQ(factor, 0) << "Invalid factor " << factor << " before primal axis " << c; + axes.push_back(ir::Var(std::string(1, c))); + } else if (c >= '0' && c <= '9') { + factor = 10 * factor + c - '0'; + } else if (c >= 'a' && c <= 'z') { + CHECK_GT(factor, 0) << "Invalid factor " << factor << " for sub-axis " << c; + axes.push_back(ir::Var(factor, std::string(1, c))); + factor = 0; + } else { + LOG(FATAL) << "Invalid layout: " << name; + } + } + name_ = name; + axes_ = axes; + Verify(); +} + +} // namespace ir +} // namespace cinn diff --git a/paddle/cinn/ir/layout.h b/paddle/cinn/ir/layout.h new file mode 100644 index 0000000000000..1af93114c93bd --- /dev/null +++ b/paddle/cinn/ir/layout.h @@ -0,0 +1,48 @@ +// Copyright (c) 2021 CINN Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include +#include + +#include "cinn/ir/ir.h" +#include "cinn/ir/ir_base.h" + +namespace cinn { +namespace ir { +class Layout { + public: + std::string name_; + std::string axis_names_; + std::vector axes_; + + Layout(const std::string& name, const std::vector& axes) : name_(name), axes_(axes) { Verify(); } + + explicit Layout(const std::string& name); + + inline const std::string& name() const { return name_; } + // axis name without factor + inline const std::string& axis_names() const { return axis_names_; } + inline const std::vector& axes() const { return axes_; } + inline int ndims() const { return axes_.size(); } + inline const Var operator[](int i) const { return axes_[i]; } + inline const char axis_names(int i) const { return axis_names_[i]; } + + void Verify(); + Expr Make(const std::string& name, const std::vector& axes); +}; + +} // namespace ir +} // namespace cinn diff --git a/paddle/cinn/ir/lowered_func.cc b/paddle/cinn/ir/lowered_func.cc new file mode 100644 index 0000000000000..36b0dcf6014c8 --- /dev/null +++ b/paddle/cinn/ir/lowered_func.cc @@ -0,0 +1,472 @@ +// Copyright (c) 2021 CINN Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "cinn/ir/lowered_func.h" + +#include +#include +#include +#include +#include +#include +#include + +#include "cinn/common/common.h" +#include "cinn/common/ir_util.h" +#include "cinn/ir/buffer.h" +#include "cinn/ir/ir_printer.h" +#include "cinn/ir/ir_visitor.h" +#include "cinn/optim/tensor_write_tell.h" +#include "cinn/runtime/intrinsic.h" +#include "cinn/utils/string.h" +DECLARE_bool(cinn_ir_schedule); + +namespace cinn { +namespace ir { + +using common::bfloat16; +using common::float16; + +const _LoweredFunc_* LoweredFunc::operator->() const { return As<_LoweredFunc_>(); } +_LoweredFunc_* LoweredFunc::operator->() { return As<_LoweredFunc_>(); } + +LoweredFunc _LoweredFunc_::Make(const std::string& name, + const std::vector& args, + const Expr& body, + const std::vector& temp_bufs) { + auto* n = make_shared<_LoweredFunc_>(); + n->name = name; + n->args = args; + n->body = body; + n->temp_bufs = temp_bufs; + + n->CheckValid(); + n->PrepareAllocOutputBufferExprs(); + n->PrepareCreateTempBufferExprs(); + n->PrepareAllocTempBufferExprs(); + n->AllocTempBuffer(); + bool with_expr_gen_tensor = true; + if (FLAGS_cinn_ir_schedule) with_expr_gen_tensor = false; + n->PrepareBufferCastExprs(with_expr_gen_tensor); + n->PrepareArgumentExprs(); + n->PrepareDeallocTempBufferExprs(); + n->PrepareDeallocOutputBufferExprs(); + return LoweredFunc(n); +} + +void _LoweredFunc_::CheckValid() const { + // check there is at least one output + int out_count = 0; + int in_count = 0; + for (auto& arg : args) { + in_count += arg.is_input(); + out_count += arg.is_output(); + } + CHECK_GT(out_count, 0) << "At least one output argument is needed for a function\n" << body; +} + +std::vector _LoweredFunc_::expr_fields() { return {&body}; } +std::vector _LoweredFunc_::expr_fields() const { return {&body}; } + +void _LoweredFunc_::PrepareCudaAxisInfoFromBody() { + std::set bound_for_exprs = ir::CollectIRNodes(body, [](const Expr* expr) { + const ir::For* for_expr = expr->As(); + return for_expr != nullptr && for_expr->is_binded(); + }); + + if (bound_for_exprs.empty()) { + device_api = ir::DeviceAPI::GPU; + cuda_axis_info.set_grid_dim(0, 1); + cuda_axis_info.set_block_dim(0, 1); + cuda_axis_info.set_valid(true); + return; + } + + // bound_for_exprs.empty() is false + for (const Expr& expr : bound_for_exprs) { + const ir::For* for_expr = expr.As(); + if (for_expr->for_type() == ir::ForType::GPUBlock) { + cuda_axis_info.set_grid_dim(for_expr->bind_info().offset, for_expr->extent.as_int32()); + } else if (for_expr->for_type() == ir::ForType::GPUThread) { + cuda_axis_info.set_block_dim(for_expr->bind_info().offset, for_expr->extent.as_int32()); + } + } + device_api = ir::DeviceAPI::GPU; + cuda_axis_info.set_valid(true); +} + +void _LoweredFunc_::PrepareAllocOutputBufferExprs() { + CHECK(alloc_output_buffer_exprs.empty()) << "duplicate prepare the allocate buffer for outputs"; + + std::set buffer_names; + for (auto& arg : args) { + if (arg.is_output()) { + 
CHECK(arg.type().valid()) << "argument [" << arg.name() << "]'s type should be set"; + if (arg.is_buffer() && !buffer_names.count(arg.name())) { // only buffer need allocation. + buffer_names.insert(arg.name()); // Avoid duplicate + alloc_output_buffer_exprs.push_back( + Alloc::Make(arg.buffer_arg(), arg.buffer_arg()->type(), arg.buffer_arg()->shape, Expr(), Expr())); + } + } + } +} + +std::vector _LoweredFunc_::PrepareAllocTempBufferExprs() const { + std::vector alloc_temp_buffer_exprs; + for (auto& temp_buf : temp_bufs) { + if (!temp_buf->shape.empty() && temp_buf->type() != Void()) { + alloc_temp_buffer_exprs.push_back(Alloc::Make(temp_buf, temp_buf->type(), temp_buf->shape, Expr(), Expr())); + } + } + return alloc_temp_buffer_exprs; +} + +std::vector _LoweredFunc_::PrepareDeallocTempBufferExprs() const { + std::vector dealloc_temp_buffer_exprs; + for (auto& temp_buf : temp_bufs) { + if (!temp_buf->shape.empty() && temp_buf->type() != Void()) { + dealloc_temp_buffer_exprs.push_back(Free::Make(temp_buf)); + } + } + return dealloc_temp_buffer_exprs; +} + +std::vector _LoweredFunc_::PrepareCreateTempBufferExprs() const { + std::vector create_temp_buffer_exprs; + for (auto& temp_buf : temp_bufs) { + if (!temp_buf->shape.empty() && temp_buf->type() != Void()) { + auto expr = ir::intrinsics::BufferCreate::Make(temp_buf); + auto buffer_ptr_type = Type().set_customized_type(common::customized_type::kbuffer_t).set_cpp_handle(); + Var variable = ir::_Var_::Make(temp_buf->name, buffer_ptr_type); + expr = ir::Let::Make(variable, expr); + create_temp_buffer_exprs.push_back(expr); + } + } + return create_temp_buffer_exprs; +} + +std::vector _LoweredFunc_::CudaPrepareAllocTempBufferExprs() const { + std::vector alloc_output_buffer_exprs; + for (auto temp_buf : temp_bufs) { + if (utils::Startswith(temp_buf->name, "_")) { + temp_buf->name = temp_buf->name.substr(1); + } + if (!temp_buf->shape.empty() && temp_buf->type() != Void()) { + alloc_output_buffer_exprs.push_back(Alloc::Make(temp_buf, temp_buf->type(), temp_buf->shape, Expr(), Expr())); + } + } + return alloc_output_buffer_exprs; +} + +void _LoweredFunc_::PrepareDeallocOutputBufferExprs() { + CHECK(dealloc_output_buffer_exprs.empty()) << "duplicate prepare the allocate buffer for outputs"; + + std::set buffer_names; + for (auto& arg : args) { + if (arg.is_output()) { + CHECK(arg.type().valid()) << "argument [" << arg.name() << "]'s type should be set"; + if (arg.is_buffer() && !buffer_names.count(arg.name())) { // only buffer need allocation. + buffer_names.insert(arg.name()); // Avoid duplicate + dealloc_output_buffer_exprs.push_back(Free::Make(arg.buffer_arg())); + } + } + } +} + +void _LoweredFunc_::AllocTempBuffer() {} + +void _LoweredFunc_::PrepareBufferCastExprs(bool with_expr_gen_tensor) { + buffer_data_cast_exprs.clear(); + // collect write. + optim::TensorWriteTeller write_teller; + write_teller.Collect(&body); + + auto tensors = CollectAllTensorReference(with_expr_gen_tensor); + std::sort(tensors.begin(), tensors.end(), [](const Tensor& a, const Tensor& b) { return a->name < b->name; }); + + VLOG(3) << "Function used " << tensors.size() << " buffers"; + for (auto& tensor : tensors) { + auto* node = tensor.As(); + CHECK(node); + if (!tensor->buffer.defined()) continue; + + Type value_type = tensor->type().ElementOf(); + bool is_const = !write_teller.IsWrite(tensor->name); + value_type.set_cpp_handle(); + value_type.set_cpp_const(is_const); + Var variable = _Var_::Make(tensor->name, value_type); + + Expr body = is_const ? 
ir::intrinsics::BufferGetDataConstHandle::Make(tensor->buffer) + : ir::intrinsics::BufferGetDataHandle::Make(tensor->buffer); + + Type target_type = is_const ? tensor->buffer->dtype.PointerOf().ConstOf() : tensor->buffer->dtype.PointerOf(); + body = ir::Cast::Make(target_type, body); + auto let = Let::Make(variable, body); + + buffer_data_cast_exprs.push_back(let); + } +} + +std::vector _LoweredFunc_::CudaAliasVarExprs() const { + std::unordered_set args_buffer; + for (auto arg : args) { + args_buffer.insert(arg.name()); + } + // collect write. + std::vector res; + optim::TensorWriteTeller write_teller; + write_teller.Collect(&body); + + auto tensors = CollectAllTensorReference(); + std::sort(tensors.begin(), tensors.end(), [](const Tensor& a, const Tensor& b) { return a->name < b->name; }); + + for (auto& tensor : tensors) { + auto* node = tensor.As(); + CHECK(node); + if (!tensor->buffer.defined()) { + continue; + } + if (tensor->name == tensor->buffer->name.substr(1) || args_buffer.count(tensor->buffer->name) == 0) { + continue; + } + Type value_type = tensor->type().ElementOf(); + bool is_const = !write_teller.IsWrite(tensor->name); + value_type.set_cpp_handle(); + value_type.set_cpp_const(is_const); + Var variable = _Var_::Make(tensor->name, value_type); + Var body = Var(tensor->buffer->name.substr(1), value_type); + + auto let = Let::Make(variable, body); + + res.push_back(let); + } + return res; +} + +void _LoweredFunc_::PrepareArgumentExprs() { + // Seems a CINN func. + if (args.front().is_var() && args.front().var_arg()->type() == type_of()) return; + + // type of `void*` + auto void_ptr_array_type = Type().with_type(Type::type_t::Void).set_cpp_handle(); + // type of `cinn_buffer_t*` + auto buffer_ptr_type = Type().set_customized_type(common::customized_type::kbuffer_t).set_cpp_handle(); + // type of `const cinn_buffer_t*` + auto const_buffer_ptr_type = buffer_ptr_type.with_cpp_const(); + CHECK(!buffer_ptr_type.is_cpp_const()); + + Var args_passed_in("_args", type_of()); + auto pod_value_ptr = common::CastIfNeeded(args_passed_in, type_of()); + + if (FLAGS_cinn_runtime_display_debug_info) { + argument_prepare_exprs.push_back(runtime::IntrinsicCall( + Void(), runtime::intrinsic::print_debug_args_repr, {pod_value_ptr, common::make_const(Int(32), args.size())})); + } + + /* + * Get something like: + * + * const cinn_buffer_t* _A = args[0]; + * cinn_buffer_t* _B = (cinn_buffer_t*)args[1]; + * int M = (int)arg[2]; + */ + + // We just has two kinds of argument types, first is `cinn_buffer_t*`, second is `const cinn_buffer_t*`, do not need a + // `any` type support currently. + for (int i = 0; i < args.size(); i++) { + auto& arg = args[i]; + // cast arg to cinn_pod_value_t* + + // something like `_args[0]` + Expr load_expr = Load::Make(pod_value_ptr, {common::make_const(i)}); + CHECK_EQ(load_expr.type(), type_of()); + load_expr = ir::intrinsics::GetAddr::Make(load_expr); + + Var _arg; + bool is_const = arg.is_input(); + + if (arg.is_buffer()) { + auto buffer_type = is_const ? 
const_buffer_ptr_type : buffer_ptr_type; + _arg = Var(arg.name(), buffer_type); + } else if (arg.is_var()) { + _arg = Var(arg.name(), arg.var_arg()->type()); + } else { + CINN_NOT_IMPLEMENTED + } + + CHECK(_arg->type().valid()); + + Expr pod_cast_expr; + + if (arg.is_buffer()) { + pod_cast_expr = ir::intrinsics::PodValueToX::Make(load_expr, type_of()); + } else if (arg.type() == type_of()) { + pod_cast_expr = ir::intrinsics::PodValueToX::Make(load_expr, type_of()); + } else if (arg.type() == type_of()) { + pod_cast_expr = ir::intrinsics::PodValueToX::Make(load_expr, type_of()); + } else if (arg.type() == type_of()) { + pod_cast_expr = ir::intrinsics::PodValueToX::Make(load_expr, type_of()); + } else if (arg.type() == type_of()) { + pod_cast_expr = ir::intrinsics::PodValueToX::Make(load_expr, type_of()); + } else if (arg.type() == type_of()) { + pod_cast_expr = ir::intrinsics::PodValueToX::Make(load_expr, type_of()); + } else if (arg.type() == type_of()) { + pod_cast_expr = ir::intrinsics::PodValueToX::Make(load_expr, type_of()); + } else if (arg.type() == type_of()) { + pod_cast_expr = ir::intrinsics::PodValueToX::Make(load_expr, type_of()); + } else if (arg.type() == type_of()) { + pod_cast_expr = ir::intrinsics::PodValueToX::Make(load_expr, type_of()); + } else if (arg.type() == type_of()) { + pod_cast_expr = ir::intrinsics::PodValueToX::Make(load_expr, type_of()); + } else if (arg.type() == type_of()) { + pod_cast_expr = ir::intrinsics::PodValueToX::Make(load_expr, type_of()); + } else if (arg.type() == type_of()) { + pod_cast_expr = ir::intrinsics::PodValueToX::Make(load_expr, type_of()); + } else if (arg.type() == type_of()) { + pod_cast_expr = ir::intrinsics::PodValueToX::Make(load_expr, type_of()); + } else if (arg.type() == type_of()) { + pod_cast_expr = ir::intrinsics::PodValueToX::Make(load_expr, type_of()); + } else if (arg.type() == type_of()) { + pod_cast_expr = ir::intrinsics::PodValueToX::Make(load_expr, type_of()); + } else { + LOG(ERROR) << "Not supported type [" << arg.type() << "]"; + CINN_NOT_IMPLEMENTED + } + + Expr let_expr = Let::Make(_arg, pod_cast_expr); + CHECK(let_expr.type().valid()); + argument_prepare_exprs.push_back(let_expr); + } +} + +std::vector _LoweredFunc_::CollectAllTensorReference(bool with_expr_gen_tensor) const { + std::set tensor_exprs = + with_expr_gen_tensor + ? ir::CollectIRNodes(body, [](const Expr* expr) { return expr->As(); }) + : ir::CollectIRNodesWithoutTensor(body, [](const Expr* expr) { return expr->As(); }); + + std::vector tensors; + // remove the duplicate tensor by their name. 
+ std::set names; + + for (const Expr& expr : tensor_exprs) { + Expr& _expr = *const_cast(&expr); + Tensor b(_expr.As<_Tensor_>()); + if (names.count(b->name)) continue; + tensors.push_back(b); + names.insert(b->name); + } + + return tensors; +} + +ir::Buffer Argument::buffer_arg() const { + CHECK(is_buffer()); + return buffer_arg_; +} + +ir::Var Argument::var_arg() const { + CHECK(is_var()); + return var_arg_; +} + +void Argument::set_buffer(const ir::Buffer& x) { + CHECK(!is_var()) << "the buffer is already a var"; + buffer_arg_ = x; +} + +void Argument::set_var(const ir::Var& x) { + CHECK(!is_buffer()) << "the buffer is already a buffer"; + var_arg_ = x; +} + +Argument::Argument(const ir::Buffer& buffer, Argument::IO io) { + set_buffer(buffer); + this->io = io; +} + +Type Argument::type() const { + if (is_var()) + return var_arg()->type(); + else if (is_buffer()) + return buffer_arg()->type(); + else + CINN_NOT_IMPLEMENTED +} + +std::string Argument::name() const { + if (is_buffer()) + return buffer_arg()->name; + else if (is_var()) + return var_arg()->name; + else + CINN_NOT_IMPLEMENTED + return ""; +} + +Argument::Argument(const ir::Var& var, Argument::IO io) { + set_var(var); + this->io = io; +} + +std::string Argument::human_readable() const { + std::stringstream os; + os << ""; + return os.str(); +} + +std::ostream& operator<<(std::ostream& os, const CudaAxisInfo& x) { + os << ""; + os << ""; + return os; +} + +void CudaAxisInfo::set_grid_dim(int offset, int x) { + valid_ = true; + CHECK_LT(offset, 3); + grid_dims_[offset] = x; +} +void CudaAxisInfo::set_block_dim(int offset, int x) { + valid_ = true; + CHECK_LT(offset, 3); + block_dims_[offset] = x; +} +int CudaAxisInfo::grid_dim(int offset) const { + CHECK(valid_); + CHECK_LT(offset, 3); + return grid_dims_[offset]; +} +int CudaAxisInfo::block_dim(int offset) const { + CHECK(valid_); + CHECK_LT(offset, 3); + return block_dims_[offset]; +} +void CudaAxisInfo::ExtendWith(const CudaAxisInfo& other) { + set_valid(true); + for (int i = 0; i < 3; i++) { + grid_dims_[i] = std::max(grid_dims_[i], other.grid_dims_[i]); + block_dims_[i] = std::max(block_dims_[i], other.block_dims_[i]); + } +} +void CudaAxisInfo::CopyGridDimsTo(std::vector* dest) const { + dest->insert(dest->begin(), grid_dims_.begin(), grid_dims_.end()); +} +void CudaAxisInfo::CopyBlockDimsTo(std::vector* dest) const { + dest->insert(dest->begin(), block_dims_.begin(), block_dims_.end()); +} + +} // namespace ir +} // namespace cinn diff --git a/paddle/cinn/ir/lowered_func.h b/paddle/cinn/ir/lowered_func.h new file mode 100755 index 0000000000000..f237232b1c7ab --- /dev/null +++ b/paddle/cinn/ir/lowered_func.h @@ -0,0 +1,198 @@ +// Copyright (c) 2021 CINN Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include +#include + +#include "cinn/ir/buffer.h" +#include "cinn/ir/ir_base.h" + +namespace cinn { +namespace ir { + +class _LoweredFunc_; + +/** + * A struct representing an argument to a lowered function. 
Used for specifying the function signature of generated + * code. + */ +struct Argument { + //! Input or output. + enum class IO { kInput = 0, kOutput = 1 }; + + IO io{IO::kInput}; + + Argument() = default; + explicit Argument(const ir::Buffer& buffer, IO io = IO::kInput); + explicit Argument(const ir::Var& var, IO io = IO::kInput); + + //! Set the buffer argument, all the buffer information are stored in ir::Buffer. + void set_buffer(const ir::Buffer& x); + + //! Set the var argument. + void set_var(const ir::Var& x); + + bool is_input() const { return io == IO::kInput; } + bool is_output() const { return io == IO::kOutput; } + + bool is_var() const { return var_arg_.defined(); } + bool is_buffer() const { return buffer_arg_.defined(); } + bool defined() const { return is_var() || is_buffer(); } + + ir::Buffer buffer_arg() const; + ir::Var var_arg() const; + + //! The type of the buffer or scalar. + Type type() const; + + std::string name() const; + + std::string human_readable() const; + + private: + //! The buffer field. + ir::Buffer buffer_arg_; + //! The scalar field. + ir::Var var_arg_; +}; + +//! Wrapper for _LoweredFunc_ +class LoweredFunc : public IrNodeRef { + public: + LoweredFunc() = default; + explicit LoweredFunc(IrNode* n) : IrNodeRef(n) {} + + operator Expr() const { return Expr(ptr()); } + + const _LoweredFunc_* operator->() const; + _LoweredFunc_* operator->(); +}; + +using dim3_t = std::array; +struct CudaAxisInfo { + CudaAxisInfo() { + for (int& v : grid_dims_) v = 1; + for (int& v : block_dims_) v = 1; + set_valid(false); + } + + void set_grid_dim(int offset, int x); + void set_block_dim(int offset, int x); + + int grid_dim(int offset) const; + int block_dim(int offset) const; + + void CopyGridDimsTo(std::vector* dest) const; + void CopyBlockDimsTo(std::vector* dest) const; + + inline void set_valid(bool x = false) { valid_ = x; } + inline bool valid() const { return valid_; } + + //! Extend the axis dims and keep the larger dims. + void ExtendWith(const CudaAxisInfo& other); + + private: + // the three dimensions represents x, y, z + dim3_t grid_dims_; + // the three dimensions represents x, y, z + dim3_t block_dims_; + bool valid_{false}; +}; + +std::ostream& operator<<(std::ostream& os, const CudaAxisInfo& x); + +/** + * Definition of a lowered function. Note that, it should be functional. + * + * Arguments of the function: + * + * both the input and output arguments, the output arguments are in the tail. + */ +struct _LoweredFunc_ : ExprNode<_LoweredFunc_> { + //! The name of this function. + std::string name; + + //! The Arguments used in the body of the function. + std::vector args; + + //! Temporary buffers(as output), these buffers will not appear in the function's argument list, but will be used in + //! the body. + std::vector temp_bufs; + + //! Body of this function. + Expr body; + + DeviceAPI device_api{DeviceAPI::UNK}; + + CudaAxisInfo cuda_axis_info; + + /** + * The output buffer will be resized to the size required, we leave all the expression here. + * The allocation and deallocation expressions will insert into the head and tail of the function's body. It supports + * lazy allocation/deallocation if the corresponding intristic methods support. + * + * Currently, we assume that all the input and output buffers should locate in heap, no other memory type is allowed. + */ + // @{ + std::vector alloc_output_buffer_exprs; + std::vector dealloc_output_buffer_exprs; + // @} + + //! 
something like: float* A_data = (float*)(A->memory); + std::vector buffer_data_cast_exprs; + + std::vector argument_prepare_exprs; + + static LoweredFunc Make(const std::string& name, + const std::vector& args, + const Expr& body, + const std::vector& temp_bufs); + + bool is_gpu_host() const { return cuda_axis_info.valid(); } + + void Verify() const override {} + + std::vector expr_fields() override; + std::vector expr_fields() const override; + + static const IrNodeTy _node_type_ = IrNodeTy::_LoweredFunc_; + + std::vector PrepareCreateTempBufferExprs() const; + //! Prepare the expressions for `alloc_tmp_buffer_exprs`. + std::vector PrepareAllocTempBufferExprs() const; + std::vector PrepareDeallocTempBufferExprs() const; + std::vector CudaPrepareAllocTempBufferExprs() const; + std::vector CudaAliasVarExprs() const; + void PrepareBufferCastExprs(bool with_expr_gen_tensor = true); + void PrepareCudaAxisInfoFromBody(); + + private: + void CheckValid() const; + //! Prepare the expressions for `alloc_output_buffer_exprs`. + void PrepareAllocOutputBufferExprs(); + //! Prepare the expressions for `dealloc_output_buffer_exprs`. + void PrepareDeallocOutputBufferExprs(); + //! Insert the allocation expr for temporary variables. + void AllocTempBuffer(); + + void PrepareArgumentExprs(); + //! Get all the Buffers the function body references. + //! NOTE it will return the buffers with duplicates removed(by comparing their name). + std::vector CollectAllTensorReference(bool with_expr_gen_tensor = true) const; +}; + +} // namespace ir +} // namespace cinn diff --git a/paddle/cinn/ir/module.cc b/paddle/cinn/ir/module.cc new file mode 100644 index 0000000000000..d0bd612bf0a7b --- /dev/null +++ b/paddle/cinn/ir/module.cc @@ -0,0 +1,97 @@ +// Copyright (c) 2021 CINN Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "cinn/ir/module.h" + +#include + +#include "cinn/optim/ir_simplify.h" +#include "cinn/optim/optimize.h" + +namespace cinn { +namespace ir { + +void Module::Builder::AddFunction(ir::LoweredFunc func) { + optim::Simplify(&(func->body)); + optim::SimplifyForLoops(&(func->body)); + optim::SimplifyBlocks(&(func->body)); + func->body = optim::Optimize(func->body, module_->target); + module_->functions.push_back(func); +} + +void Module::Builder::AddBuffer(ir::Buffer buffer) { + CHECK(buffer->target.defined()) << "buffer [" << buffer->name << "]'s target is undefined"; + if (std::find_if(module_->buffers.begin(), module_->buffers.end(), [&](const Expr &x) { + return x.as_buffer()->name == buffer->name; + }) == std::end(module_->buffers)) { + module_->buffers.push_back(buffer); + if (module_->target.arch == Target::Arch::X86) { + module_->buffers.back().as_buffer()->data_alignment = 32; + } + } +} + +void Module::Builder::Clear() { + module_->buffers.clear(); + module_->functions.clear(); + module_->submodules.clear(); +} + +Module Module::Builder::Build() { + if (module_->functions.empty()) { + VLOG(1) << "Module has no functions"; + } + + auto res = ir::Module(module_.get()); + + return optim::Optimize(res, module_->target); +} + +ir::_Module_ *Module::self() { return p_->as(); } +const ir::_Module_ *Module::self() const { return p_->as(); } + +const Target &Module::target() const { return self()->target; } + +const std::string &Module::name() const { return self()->name; } + +std::vector Module::buffers() const { + std::vector buffers; + for (auto &buffer : self()->buffers) { + buffers.emplace_back(buffer.as_buffer_ref()); + } + return buffers; +} + +std::vector Module::functions() const { + std::vector functions; + for (auto &x : self()->functions) { + functions.emplace_back(x.as_lowered_func_ref()); + } + return functions; +} + +std::vector Module::submodules() const { + std::vector modules; + for (auto &x : self()->submodules) { + modules.push_back(x.as_module_ref()); + } + return modules; +} + +void Module::Compile(const backends::Outputs &outputs) const {} + +Module::operator Expr() const { return Expr(ptr()); } + +} // namespace ir +} // namespace cinn diff --git a/paddle/cinn/ir/module.h b/paddle/cinn/ir/module.h new file mode 100644 index 0000000000000..e92df6f219801 --- /dev/null +++ b/paddle/cinn/ir/module.h @@ -0,0 +1,89 @@ +// Copyright (c) 2021 CINN Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include + +#include "cinn/backends/outputs.h" +#include "cinn/common/common.h" +#include "cinn/ir/lowered_func.h" +#include "cinn/lang/buffer.h" + +namespace cinn { + +namespace backends { +class CodeGenC; +} // namespace backends + +namespace ir { + +/** + * Module represents IR containing lowered function definitions and buffers. 
+ */ +class Module : public ir::IrNodeRef { + public: + struct Builder { + Builder(const std::string& name, const Target& target) : module_(common::make_shared()) { + module_->name = name; + module_->target = target; + } + + void AddFunction(ir::LoweredFunc func); + void AddBuffer(ir::Buffer buffer); + void Clear(); + + Module Build(); + + private: + Shared module_; + }; + + //! Get the target of this module. + const Target& target() const; + + //! Get the name of the module. + const std::string& name() const; + + //! The members in the module. + // @{ + std::vector buffers() const; + std::vector functions() const; + std::vector submodules() const; + // @} + + //! Compile a module to some outputs. + void Compile(const backends::Outputs& outputs) const; + + ir::_Module_* self(); + const ir::_Module_* self() const; + + ir::_Module_* operator->() { return self(); } + const ir::_Module_* operator->() const { return self(); } + + operator Expr() const; + + protected: + Module(const std::string& name, const Target& target); + + explicit Module(ir::IrNode* n) : ir::IrNodeRef(n) {} + + friend class Module::Builder; + friend class backends::CodeGenC; + friend class ::cinn::ir::Expr; + friend class ::cinn::ir::_Module_; +}; + +} // namespace ir +} // namespace cinn diff --git a/paddle/cinn/ir/operation.cc b/paddle/cinn/ir/operation.cc new file mode 100644 index 0000000000000..217d0f853b762 --- /dev/null +++ b/paddle/cinn/ir/operation.cc @@ -0,0 +1,113 @@ +// Copyright (c) 2021 CINN Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "cinn/ir/operation.h" + +#include + +#include "cinn/common/common.h" + +namespace cinn { +namespace ir { + +Operation PlaceholderOp::Make(const std::string &name, const std::vector &shape, Type dtype) { + auto n = make_shared(); + n->name = name; + n->shape = shape; + n->set_type(dtype); + return Operation(n); +} + +const char *PlaceholderOp::func_type() const { return "placeholder_op"; } + +const char *ComputeOp::func_type() const { return "compute_op"; } + +Operation ComputeOp::Make(const std::string &name, + ComputeOp::handle_t handle, + const std::vector &shape, + const std::vector &domain, + const std::vector &reduce_axis, + const std::map &attrs, + const std::string &tag) { + auto n = make_shared(); + n->name = name; + n->producer_fn = handle; + n->shape = domain; + n->reduce_axis = reduce_axis; + n->tag = tag; + n->attrs = attrs; + auto axis = common::GenDefaultAxis(domain.size()); + std::vector _axis; + for (auto &x : axis) _axis.push_back(x); + n->body = {handle(_axis)}; + n->reduce_axis = reduce_axis; + return Operation(n); +} + +Operation CallOp::Make(const std::string &call_target, Expr call_op) { + auto n = make_shared(); + n->call_expr = call_op; + return Operation(n); +} + +Operation PrecedingViewOp::Make(const Tensor &tensor, int preceding_axis) { return Operation(); } + +const char *PrecedingViewOp::func_type() const { return PrecedingViewOp::__func_type__; } + +const char *CallOp::func_type() const { return __func_type__; } + +const char *ComputeOp::__func_type__ = "compute_op"; +const char *PlaceholderOp::__func_type__ = "placeholder_op"; +const char *CallOp::__func_type__ = "call_op"; + +const std::string &CallOp::target() const { + auto *call = call_expr.As(); + CHECK(call); + return call->name; +} +std::vector &CallOp::write_args() { + auto *call = call_expr.As(); + CHECK(call); + return call->write_args; +} +std::vector &CallOp::read_args() { + auto *call = call_expr.As(); + CHECK(call); + return call->read_args; +} +const std::vector &CallOp::write_args() const { + auto *call = call_expr.As(); + CHECK(call); + return call->write_args; +} +const std::vector &CallOp::read_args() const { + auto *call = call_expr.As(); + CHECK(call); + return call->read_args; +} +std::vector CallOp::args() const { + std::vector args; + auto &rargs = read_args(); + auto &wargs = write_args(); + args.insert(std::end(args), rargs.begin(), rargs.end()); + args.insert(std::end(args), wargs.begin(), wargs.end()); + return args; +} +const char *PrecedingViewOp::__func_type__ = "preceding_view_op"; + +const char *BufferShareOp::__func_type__ = "buffer_share_op"; +const char *BufferShareOp::func_type() const { return __func_type__; } + +} // namespace ir +} // namespace cinn diff --git a/paddle/cinn/ir/operation.h b/paddle/cinn/ir/operation.h new file mode 100644 index 0000000000000..be30969105356 --- /dev/null +++ b/paddle/cinn/ir/operation.h @@ -0,0 +1,130 @@ +// Copyright (c) 2021 CINN Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include +#include +#include +#include + +#include "cinn/ir/buffer.h" +#include "cinn/ir/ir.h" +#include "cinn/ir/tensor.h" + +namespace cinn { +namespace ir { + +/** + * @brief A placeholder op represents an input placeholder. + */ +struct PlaceholderOp : public _Operation_ { + //! The shape of the input. + std::vector shape; + //! The data type of the input. + Type dtype; + + static Operation Make(const std::string &name, const std::vector &shape, Type dtype); + + const char *func_type() const override; + + static char const *__func_type__; +}; + +struct CallOp : public _Operation_ { + const std::string &target() const; + + Expr call_expr; + + std::vector &read_args(); + std::vector &write_args(); + const std::vector &read_args() const; + const std::vector &write_args() const; + std::vector args() const; + + //! A reference to the target LoweredFunc if this CallOp calls an generated LoweredFunc. + Expr func; + + // the offset int the tuple of return values. + int value_slot{-1}; + + bool is_tuple_get{false}; + + //! Number of the value slots. + int num_value_slots{0}; + + CallOp() = default; + + static Operation Make(const std::string &call_target, Expr call_op); + + const char *func_type() const override; + + static char const *__func_type__; +}; + +/** + * The operation of the preceding view of a tensor. + */ +struct PrecedingViewOp : public _Operation_ { + Expr tensor; + + int preceding_axis{-1}; + + static Operation Make(const Tensor &tensor, int preceding_axis); + + const char *func_type() const override; + + static char const *__func_type__; +}; + +/** + * Share the same buffer. + */ +struct BufferShareOp : public _Operation_ { + const char *func_type() const override; + static Operation Make() { return Operation(new BufferShareOp); } + static char const *__func_type__; +}; + +/** + * @brief A Compute op that compute a tensor on certain domain. + */ +struct ComputeOp : public _Operation_ { + using handle_t = std::function &)>; + //! Var on each reduction axis, if the body is a Reduction. + std::vector reduce_axis; + //! Shape of the output. + std::vector shape; + //! The compute expression. + std::vector body; + //! The functor to generate the body, used to inline the expression if needed. + handle_t producer_fn; + + ComputeOp() = default; + + static Operation Make(const std::string &name, + ComputeOp::handle_t handle, + const std::vector &shape, + const std::vector &domain, + const std::vector &reduce_axis = {}, + const std::map &attrs = {}, + const std::string &tag = ""); + + const char *func_type() const override; + + static const char *__func_type__; +}; + +} // namespace ir +} // namespace cinn diff --git a/paddle/cinn/ir/registry.cc b/paddle/cinn/ir/registry.cc new file mode 100644 index 0000000000000..2e8a7caf1efb1 --- /dev/null +++ b/paddle/cinn/ir/registry.cc @@ -0,0 +1,93 @@ +// Copyright (c) 2021 CINN Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "cinn/ir/registry.h" + +#include +#include // NOLINT + +namespace cinn::ir { +struct Registry::Manager { + static Manager *Global() { + static Manager manager; + return &manager; + } + + std::mutex mu; + std::map functions; + + private: + Manager() = default; + Manager(const Manager &) = delete; + void operator=(Manager &) = delete; +}; + +Registry &Registry::SetBody(lang::PackedFunc f) { + func_ = f; + return *this; +} + +Registry &Registry::SetBody(lang::PackedFunc::body_t f) { + func_ = lang::PackedFunc(f); + return *this; +} + +Registry::Registry(const std::string &name) : name_(name) {} + +/*static*/ Registry &Registry::Register(const std::string &name, bool can_override) { + auto *manager = Registry::Manager::Global(); + std::lock_guard lock(manager->mu); + if (manager->functions.count(name)) { + CHECK(can_override) << "Global PackedFunc[" << name << "] is already exists"; + } + + auto *r = new Registry(name); + manager->functions[name] = r; + return *r; +} + +/*static*/ bool Registry::Remove(const std::string &name) { + auto manager = Manager::Global(); + std::lock_guard lock(manager->mu); + auto it = manager->functions.find(name); + if (it != manager->functions.end()) { + manager->functions.erase(it); + return true; + } + return false; +} + +/*static*/ const lang::PackedFunc *Registry::Get(const std::string &name) { + auto *manager = Manager::Global(); + std::lock_guard lock(manager->mu); + auto *r = manager->functions[name]; + if (r) { + return &r->func_; + } + return nullptr; +} + +/*static*/ std::vector Registry::ListNames() { + auto *manager = Manager::Global(); + std::lock_guard lock(manager->mu); + std::vector keys; + for (const auto &_k_v_ : manager->functions) { + auto &k = std::get<0>(_k_v_); + auto &v = std::get<1>(_k_v_); + keys.push_back(k); + } + return keys; +} + +} // namespace cinn::ir diff --git a/paddle/cinn/ir/registry.h b/paddle/cinn/ir/registry.h new file mode 100644 index 0000000000000..612213a95d9cc --- /dev/null +++ b/paddle/cinn/ir/registry.h @@ -0,0 +1,46 @@ +// Copyright (c) 2021 CINN Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include + +#include +#include + +#include "cinn/lang/packed_func.h" + +namespace cinn::ir { + +class Registry { + public: + Registry &SetBody(lang::PackedFunc f); + Registry &SetBody(lang::PackedFunc::body_t f); + + static Registry &Register(const std::string &name, bool can_override = false); + static bool Remove(const std::string &name); + static const lang::PackedFunc *Get(const std::string &name); + static std::vector ListNames(); + + struct Manager; + + explicit Registry(const std::string &); + + protected: + std::string name_; + lang::PackedFunc func_; + friend class Manager; +}; + +} // namespace cinn::ir diff --git a/paddle/cinn/ir/schedule_desc.cc b/paddle/cinn/ir/schedule_desc.cc new file mode 100644 index 0000000000000..cb50cc2ab9614 --- /dev/null +++ b/paddle/cinn/ir/schedule_desc.cc @@ -0,0 +1,680 @@ +// Copyright (c) 2022 CINN Authors. 
+// Copyright (c) 2022 CINN Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "cinn/ir/schedule_desc.h"
+
+#include <glog/logging.h>
+
+#include <iterator>
+#include <typeinfo>
+#include <utility>
+
+#include "cinn/common/macros.h"
+#include "cinn/ir/ir_schedule.h"
+#include "cinn/utils/string.h"
+
+namespace cinn {
+namespace ir {
+
+// ------ The following code implements the `Apply` function registry for the various types of ScheduleDesc::Step
+class PackedStepContext;
+// uniformed function prototype of a scheduling operation in IRSchedule
+using StepApplyFunc = std::vector<Expr> (*)(PackedStepContext*);
+
+// formats the inputs, attrs, and uniformed apply function of a scheduling step
+class StepKindInfo {
+ public:
+  // compatible for Registry::EntryType
+  std::string name;
+
+  // format: {"<input1_name>", "<input2_name>", ...}
+  StepKindInfo& Inputs(std::vector<std::string>&& inputs) {
+    inputs_ = inputs;
+    return *this;
+  }
+  // format: {"<attr1_name>", "<attr2_name>", ...}
+  StepKindInfo& Attrs(std::vector<std::string>&& attrs) {
+    attrs_ = attrs;
+    return *this;
+  }
+  // format: APPLY_FUNC_UNIFORM(...)
+  StepKindInfo& SetApplyFn(StepApplyFunc&& func) {
+    apply_func_ = func;
+    return *this;
+  }
+
+  // execute the Apply function of this type
+  std::vector<Expr> Apply(PackedStepContext* context) const { return apply_func_(context); }
+
+ private:
+  friend class PackedStepContext;
+
+  std::vector<std::string> inputs_;
+  std::vector<std::string> attrs_;
+  StepApplyFunc apply_func_{nullptr};
+};
+
+// StepKindInfo register for all scheduling steps
+class StepKindRegistry : public Registry<StepKindInfo> {
+ public:
+  StepKindRegistry() = default;
+
+ private:
+  CINN_DISALLOW_COPY_AND_ASSIGN(StepKindRegistry);
+};
+
+// PackedStepContext is the parameter of a uniformed `Apply` function; it serves as an
+// auxiliary structure to interact with the in/out arguments of the original scheduling function in IRSchedule
+class PackedStepContext {
+ public:
+  explicit PackedStepContext(const ScheduleDesc::Step& desc, const StepKindInfo* step_kind, IRSchedule* schedule)
+      : ir_schedule_(schedule) {
+    Build(desc, step_kind);
+  }
+
+  // get the pointer of the current IRSchedule object
+  IRSchedule* ScheduleHandler() const { return ir_schedule_; }
+
+  // get the idx-th input whose signature is Expr
+  Expr InputAt(size_t idx) const {
+    CHECK_LT(idx, input_range_.size()) << "idx overranges";
+    const auto& range = input_range_.at(idx);
+    CHECK(range.second - range.first == 1) << "not single param";
+    return inputs_[range.first];
+  }
+
+  // get the idx-th input whose signature is `std::vector<Expr>`
+  std::vector<Expr> InputsAt(size_t idx) const {
+    CHECK_LT(idx, input_range_.size()) << "idx overranges";
+    const auto& range = input_range_.at(idx);
+    std::vector<Expr> results;
+    for (size_t s = range.first; s < range.second; ++s) {
+      results.emplace_back(inputs_[s]);
+    }
+    return results;
+  }
+
+  // get the idx-th attribute value with the correct type
+  template <typename AttrType>
+  const AttrType& AttrAt(size_t idx) const {
+    try {
+      return absl::get<AttrType>(attrs_.at(idx));
+    } catch (absl::bad_variant_access& ex) {
+      LOG(FATAL) << "Attribute cast error, idx:" << idx << ", get type:" << typeid(AttrType).name()
+                 << ", real index:" << attrs_.at(idx).index();
+      throw ex;
+    }
+  }
+
+ private:
+  void Build(const ScheduleDesc::Step& desc, const StepKindInfo* step_kind) {
+    // build inputs
+    size_t input_idx = 0;
+    for (auto&& param_name : step_kind->inputs_) {
+      auto arg_it = desc.inputs.find(param_name);
+      CHECK(arg_it != desc.inputs.end()) << "Can't find param:" << param_name;
+      auto&& args = arg_it->second;
+      inputs_.insert(inputs_.end(), std::make_move_iterator(args.begin()), std::make_move_iterator(args.end()));
+      input_range_.emplace_back(input_idx, input_idx + args.size());
+      input_idx += args.size();
+    }
+
+    // build attrs
+    size_t attr_idx = 0;
+    for (auto&& attr_name : step_kind->attrs_) {
+      auto attr_it = desc.attrs.find(attr_name);
+      CHECK(attr_it != desc.attrs.end()) << "Can't find attribute:" << attr_name;
+      attrs_.emplace_back(attr_it->second);
+      ++attr_idx;
+    }
+  }
+
+  IRSchedule* ir_schedule_;
+  std::vector<Expr> inputs_;
+  std::vector<std::pair<size_t, size_t>> input_range_;
+  std::vector<utils::Attribute> attrs_;
+};
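+
+// For intuition: a member function such as
+//     void IRSchedule::Vectorize(const Expr& loop, int factor);
+// is adapted by FreeFuncConverter (below) into a free function whose first
+// parameter is the IRSchedule*, and ApplyFuncImpl then peels the remaining
+// parameters one by one: Expr-typed parameters are fetched with
+// InputAt/InputsAt and attribute-typed parameters with AttrAt<T>, so every
+// step kind is exposed through the same uniform signature
+//     std::vector<Expr> (*)(PackedStepContext*).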
+
+#define CINN_SPECIALIZE_ApplyCallHelper(attr_type)                                     \
+  template <typename... Tail>                                                          \
+  struct ApplyCallHelper<attr_type, Tail...> {                                         \
+    template <int in_idx, int attr_idx, int out_idx, typename... PreviousArgs>         \
+    static std::vector<Expr> Apply(PackedStepContext* ctx, PreviousArgs... pargs) {    \
+      using rf_attr_type = std::remove_reference<attr_type>::type;                     \
+      using rc_attr_type = std::remove_const<rf_attr_type>::type;                      \
+      const auto& arg = ctx->AttrAt<rc_attr_type>(attr_idx);                           \
+      return ApplyCallHelper<Tail...>::template Apply<in_idx, attr_idx + 1, out_idx>(  \
+          ctx, std::forward<PreviousArgs>(pargs)..., arg);                             \
+    }                                                                                  \
+  }
+
+template <typename T>
+struct TypeTag {};
+
+// used for converting a member function of the IRSchedule to a free function
+// whose first parameter is a pointer to the IRSchedule.
+template <typename F, F f>
+struct FreeFuncConverter;
+
+template <typename Return, typename... Args, Return (IRSchedule::*impl_fn)(Args...)>
+struct FreeFuncConverter<Return (IRSchedule::*)(Args...), impl_fn> {
+  static Return Apply(IRSchedule* sch, Args... args) { return (sch->*impl_fn)(std::forward<Args>(args)...); }
+};
+
+template <typename Return, typename... Args, Return (IRSchedule::*impl_fn)(Args...) const>
+struct FreeFuncConverter<Return (IRSchedule::*)(Args...) const, impl_fn> {
+  static Return Apply(IRSchedule* sch, Args... args) { return (sch->*impl_fn)(std::forward<Args>(args)...); }
+};
+
+// used for formatting scheduling functions with various function signatures into a uniformed form
+template <typename F, F f>
+struct ApplyFuncImpl;
+
+template <typename Return, typename... Args, Return (*impl_fn)(Args...)>
+struct ApplyFuncImpl<Return (*)(Args...), impl_fn> {
+  static std::vector<Expr> Apply(PackedStepContext* ctx) {
+    return ApplyCallHelper<Args..., TypeTag<int>>::template Apply<0, 0, 0>(ctx);
+  }
+
+ private:
+  template <typename... RemainingArgs>
+  struct ApplyCallHelper;
+
+  // the signature of input parameters of a scheduling operation can only
+  // be one of IRSchedule, Expr or std::vector<Expr>
+  template <typename... Tail>
+  struct ApplyCallHelper<IRSchedule*, Tail...> {
+    template <int in_idx, int attr_idx, int out_idx>
+    static std::vector<Expr> Apply(PackedStepContext* ctx) {
+      static_assert(in_idx == 0, "IRSchedule* must be the first argument");
+      IRSchedule* ir_schedule = ctx->ScheduleHandler();
+      return ApplyCallHelper<Tail...>::template Apply<in_idx + 1, attr_idx, out_idx>(ctx, ir_schedule);
+    }
+  };
+
+  template <typename... Tail>
+  struct ApplyCallHelper<Expr, Tail...> {
+    template <int in_idx, int attr_idx, int out_idx, typename... PreviousArgs>
+    static std::vector<Expr> Apply(PackedStepContext* ctx, PreviousArgs... pargs) {
+      auto arg = ctx->InputAt(in_idx - 1);
+      return ApplyCallHelper<Tail...>::template Apply<in_idx + 1, attr_idx, out_idx>(
+          ctx, std::forward<PreviousArgs>(pargs)..., arg);
+    }
+  };
+
+  template <typename... Tail>
+  struct ApplyCallHelper<const Expr&, Tail...> {
+    template <int in_idx, int attr_idx, int out_idx, typename... PreviousArgs>
+    static std::vector<Expr> Apply(PackedStepContext* ctx, PreviousArgs... pargs) {
+      auto arg = ctx->InputAt(in_idx - 1);
+      return ApplyCallHelper<Tail...>::template Apply<in_idx + 1, attr_idx, out_idx>(
+          ctx, std::forward<PreviousArgs>(pargs)..., arg);
+    }
+  };
+
+  template <typename... Tail>
+  struct ApplyCallHelper<const std::vector<Expr>&, Tail...> {
+    template <int in_idx, int attr_idx, int out_idx, typename... PreviousArgs>
+    static std::vector<Expr> Apply(PackedStepContext* ctx, PreviousArgs... pargs) {
+      auto arg = ctx->InputsAt(in_idx - 1);
+      return ApplyCallHelper<Tail...>::template Apply<in_idx + 1, attr_idx, out_idx>(
+          ctx, std::forward<PreviousArgs>(pargs)..., arg);
+    }
+  };
+
+  CINN_SPECIALIZE_ApplyCallHelper(bool);
+  CINN_SPECIALIZE_ApplyCallHelper(int);
+  CINN_SPECIALIZE_ApplyCallHelper(float);
+  CINN_SPECIALIZE_ApplyCallHelper(const std::string&);
+  CINN_SPECIALIZE_ApplyCallHelper(const std::vector<bool>&);
+  CINN_SPECIALIZE_ApplyCallHelper(const std::vector<int>&);
+  CINN_SPECIALIZE_ApplyCallHelper(const std::vector<float>&);
+  CINN_SPECIALIZE_ApplyCallHelper(const std::vector<std::string>&);
+  CINN_SPECIALIZE_ApplyCallHelper(int64_t);
+  CINN_SPECIALIZE_ApplyCallHelper(double);
+  CINN_SPECIALIZE_ApplyCallHelper(const std::vector<int64_t>&);
+  CINN_SPECIALIZE_ApplyCallHelper(const std::vector<double>&);
+
+  template <int out_idx, typename T>
+  struct ApplyReturnHelper;
+
+  template <int out_idx>
+  struct ApplyReturnHelper<out_idx, void> {
+    static std::vector<Expr> Apply(Args... args) {
+      impl_fn(std::forward<Args>(args)...);
+      return {};
+    }
+  };
+
+  template <int out_idx>
+  struct ApplyReturnHelper<out_idx, Expr> {
+    static std::vector<Expr> Apply(Args... args) {
+      auto ret = impl_fn(std::forward<Args>(args)...);
+      return {ret};
+    }
+  };
+
+  template <int out_idx>
+  struct ApplyReturnHelper<out_idx, std::vector<Expr>> {
+    static std::vector<Expr> Apply(Args... args) { return impl_fn(std::forward<Args>(args)...); }
+  };
+
+  // end: base template
+  template <typename T>
+  struct ApplyCallHelper<TypeTag<T>> {
+    template <int in_idx, int attr_idx, int out_idx, typename... PreviousArgs>
+    static std::vector<Expr> Apply(PackedStepContext* ctx, PreviousArgs... pargs) {
+      static_assert(out_idx == 0, "Output is exported from return value");
+      return ApplyReturnHelper<out_idx, Return>::Apply(std::forward<PreviousArgs>(pargs)...);
+    }
+  };
+};
+
+#define APPLY_FUNC_UNIFORM(...) ::cinn::ir::ApplyFuncImpl<decltype(&__VA_ARGS__), &__VA_ARGS__>::Apply
+#define FREE_FUNCTION_CONVERTER(...) ::cinn::ir::FreeFuncConverter<decltype(__VA_ARGS__), __VA_ARGS__>::Apply
+
+#define CINN_BUILD_STEP_KIND(TypeName)                                \
+  static ::cinn::ir::StepKindInfo& __step_kind_registrar_##TypeName = \
+      ::cinn::ir::StepKindRegistry::Global()->__REGISTER_OR_GET__(#TypeName)
+
+// register StepKindInfo for every type of scheduling operation
+// clang-format off
+CINN_BUILD_STEP_KIND(GetAllBlocks)
+    .SetApplyFn(APPLY_FUNC_UNIFORM(FREE_FUNCTION_CONVERTER(
+        static_cast<std::vector<Expr> (IRSchedule::*)() const>(&IRSchedule::GetAllBlocks))));
+
+CINN_BUILD_STEP_KIND(GetChildBlocks)
+    .Inputs({"expr"})
+    .SetApplyFn(APPLY_FUNC_UNIFORM(FREE_FUNCTION_CONVERTER(
+        static_cast<std::vector<Expr> (IRSchedule::*)(const Expr&) const>(&IRSchedule::GetChildBlocks))));
+
+CINN_BUILD_STEP_KIND(GetLoops)
+    .Inputs({"block"})
+    .SetApplyFn(APPLY_FUNC_UNIFORM(FREE_FUNCTION_CONVERTER(
+        static_cast<std::vector<Expr> (IRSchedule::*)(const Expr&) const>(&IRSchedule::GetLoops))));
+
+CINN_BUILD_STEP_KIND(GetLoopsWithName)
+    .Attrs({"block_name"})
+    .SetApplyFn(APPLY_FUNC_UNIFORM(FREE_FUNCTION_CONVERTER(
+        static_cast<std::vector<Expr> (IRSchedule::*)(const std::string&) const>(&IRSchedule::GetLoops))));
+
+CINN_BUILD_STEP_KIND(GetBlock)
+    .Attrs({"block_name"})
+    .SetApplyFn(APPLY_FUNC_UNIFORM(FREE_FUNCTION_CONVERTER(
+        static_cast<Expr (IRSchedule::*)(const std::string&) const>(&IRSchedule::GetBlock))));
+
+CINN_BUILD_STEP_KIND(Split)
+    .Inputs({"loop", "factors"})
+    .SetApplyFn(APPLY_FUNC_UNIFORM(FREE_FUNCTION_CONVERTER(
+        static_cast<std::vector<Expr> (IRSchedule::*)(const Expr&, const std::vector<Expr>&)>(&IRSchedule::Split))));
+
+CINN_BUILD_STEP_KIND(Fuse)
+    .Inputs({"loops"})
+    .SetApplyFn(APPLY_FUNC_UNIFORM(FREE_FUNCTION_CONVERTER(
+        static_cast<Expr (IRSchedule::*)(const std::vector<Expr>&)>(&IRSchedule::Fuse))));
+
+CINN_BUILD_STEP_KIND(FuseWithName)
+    .Attrs({"block_name", "loops_index"})
+    .SetApplyFn(APPLY_FUNC_UNIFORM(FREE_FUNCTION_CONVERTER(
+        static_cast<Expr (IRSchedule::*)(const std::string&, const std::vector<int>&)>(&IRSchedule::Fuse))));
+
+CINN_BUILD_STEP_KIND(FuseWithBlock)
+    .Inputs({"block"})
+    .Attrs({"loops_index"})
+    .SetApplyFn(APPLY_FUNC_UNIFORM(FREE_FUNCTION_CONVERTER(
+        static_cast<Expr (IRSchedule::*)(const Expr&, const std::vector<int>&)>(&IRSchedule::Fuse))));
+
+CINN_BUILD_STEP_KIND(ComputeAt)
+    .Inputs({"block", "loop"})
+    .Attrs({"keep_unit_loops"})
+    .SetApplyFn(APPLY_FUNC_UNIFORM(FREE_FUNCTION_CONVERTER(&IRSchedule::ComputeAt)));
+
+CINN_BUILD_STEP_KIND(SimpleComputeAt)
+    .Inputs({"block", "loop"})
+    .SetApplyFn(APPLY_FUNC_UNIFORM(FREE_FUNCTION_CONVERTER(&IRSchedule::SimpleComputeAt)));
+
+CINN_BUILD_STEP_KIND(ReverseComputeAt)
+    .Inputs({"block", "loop"})
+    .Attrs({"keep_unit_loops"})
+    .SetApplyFn(APPLY_FUNC_UNIFORM(FREE_FUNCTION_CONVERTER(&IRSchedule::ReverseComputeAt)));
+
+CINN_BUILD_STEP_KIND(GetRootBlock)
+    .Inputs({"expr"})
+    .SetApplyFn(APPLY_FUNC_UNIFORM(FREE_FUNCTION_CONVERTER(&IRSchedule::GetRootBlock)));
+
+CINN_BUILD_STEP_KIND(CacheRead)
+    .Inputs({"block"})
+    .Attrs({"read_buffer_index", "memory_type"})
+    .SetApplyFn(APPLY_FUNC_UNIFORM(FREE_FUNCTION_CONVERTER(&IRSchedule::CacheRead)));
+
+CINN_BUILD_STEP_KIND(CacheWrite)
+    .Inputs({"block"})
+    .Attrs({"write_buffer_index", "memory_type"})
+    .SetApplyFn(APPLY_FUNC_UNIFORM(FREE_FUNCTION_CONVERTER(&IRSchedule::CacheWrite)));
+
+CINN_BUILD_STEP_KIND(SyncThreads)
+    .Inputs({"ir_node"})
+    .Attrs({"after_node"})
+    .SetApplyFn(APPLY_FUNC_UNIFORM(FREE_FUNCTION_CONVERTER(&IRSchedule::SyncThreads)));
+
+CINN_BUILD_STEP_KIND(SetBuffer)
+    .Inputs({"block"})
+    .Attrs({"memory_type", "fixed"})
+    .SetApplyFn(APPLY_FUNC_UNIFORM(FREE_FUNCTION_CONVERTER(&IRSchedule::SetBuffer)));
+
+CINN_BUILD_STEP_KIND(Reorder)
+    .Inputs({"loops"})
+    .SetApplyFn(APPLY_FUNC_UNIFORM(FREE_FUNCTION_CONVERTER(
+        static_cast<Expr (IRSchedule::*)(const std::vector<Expr>&)>(&IRSchedule::Reorder))));
+
+CINN_BUILD_STEP_KIND(ReorderWithBlock)
+    .Inputs({"block"})
+    .Attrs({"loops_index"})
+    .SetApplyFn(APPLY_FUNC_UNIFORM(FREE_FUNCTION_CONVERTER(
+        static_cast<Expr (IRSchedule::*)(const Expr&, const std::vector<int>&)>(&IRSchedule::Reorder))));
+
+CINN_BUILD_STEP_KIND(ReorderWithName)
+    .Attrs({"block_name", "loops_index"})
+    .SetApplyFn(APPLY_FUNC_UNIFORM(FREE_FUNCTION_CONVERTER(
+        static_cast<Expr (IRSchedule::*)(const std::string&, const std::vector<int>&)>(&IRSchedule::Reorder))));
+
+CINN_BUILD_STEP_KIND(Parallel)
+    .Inputs({"loop"})
+    .SetApplyFn(APPLY_FUNC_UNIFORM(FREE_FUNCTION_CONVERTER(&IRSchedule::Parallel)));
+
+CINN_BUILD_STEP_KIND(Vectorize)
+    .Inputs({"loop"})
+    .Attrs({"factor"})
+    .SetApplyFn(APPLY_FUNC_UNIFORM(FREE_FUNCTION_CONVERTER(&IRSchedule::Vectorize)));
+
+CINN_BUILD_STEP_KIND(Unroll)
+    .Inputs({"loop"})
+    .SetApplyFn(APPLY_FUNC_UNIFORM(FREE_FUNCTION_CONVERTER(&IRSchedule::Unroll)));
+
+CINN_BUILD_STEP_KIND(ComputeInline)
+    .Inputs({"schedule_block"})
+    .SetApplyFn(APPLY_FUNC_UNIFORM(FREE_FUNCTION_CONVERTER(&IRSchedule::ComputeInline)));
+
+CINN_BUILD_STEP_KIND(ReverseComputeInline)
+    .Inputs({"schedule_block"})
+    .SetApplyFn(APPLY_FUNC_UNIFORM(FREE_FUNCTION_CONVERTER(&IRSchedule::ReverseComputeInline)));
+
+CINN_BUILD_STEP_KIND(Bind)
+    .Inputs({"loop"})
+    .Attrs({"thread_axis"})
+    .SetApplyFn(APPLY_FUNC_UNIFORM(FREE_FUNCTION_CONVERTER(&IRSchedule::Bind)));
+
+CINN_BUILD_STEP_KIND(Rfactor)
+    .Inputs({"rf_loop"})
+    .Attrs({"rf_axis"})
+    .SetApplyFn(APPLY_FUNC_UNIFORM(FREE_FUNCTION_CONVERTER(&IRSchedule::Rfactor)));
+
+CINN_BUILD_STEP_KIND(MergeExprs)
+    .SetApplyFn(APPLY_FUNC_UNIFORM(FREE_FUNCTION_CONVERTER(&IRSchedule::MergeExprs)));
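+
+// The two macros compose as follows: FREE_FUNCTION_CONVERTER turns a
+// pointer-to-member of IRSchedule into a plain free function whose first
+// parameter is the IRSchedule*, and APPLY_FUNC_UNIFORM wraps that free
+// function into the uniform StepApplyFunc signature. A registration for a
+// hypothetical scheduling primitive `Foo(const Expr& loop, int factor)`
+// would follow the same pattern (illustrative sketch only; IRSchedule has
+// no such member):
+//
+//   CINN_BUILD_STEP_KIND(Foo)
+//       .Inputs({"loop"})
+//       .Attrs({"factor"})
+//       .SetApplyFn(APPLY_FUNC_UNIFORM(FREE_FUNCTION_CONVERTER(&IRSchedule::Foo)));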
+
+template <typename AttrType>
+void Annotate(IRSchedule* ir_sch, const Expr&, const std::string&, AttrType);
+template <>
+void Annotate(IRSchedule* ir_sch, const Expr& block, const std::string& key, int value) {
+  ir_sch->Annotate(block, key, value);
+}
+template <>
+void Annotate(IRSchedule* ir_sch, const Expr& block, const std::string& key, bool value) {
+  ir_sch->Annotate(block, key, value);
+}
+template <>
+void Annotate(IRSchedule* ir_sch, const Expr& block, const std::string& key, float value) {
+  ir_sch->Annotate(block, key, value);
+}
+void AnnotateStringAttr(IRSchedule* ir_sch, const Expr& block, const std::string& key, const std::string& value) {
+  ir_sch->Annotate(block, key, value);
+}
+
+CINN_BUILD_STEP_KIND(AnnotateIntAttr)
+    .Inputs({"block"})
+    .Attrs({"key", "value"})
+    .SetApplyFn(APPLY_FUNC_UNIFORM(Annotate<int>));
+
+CINN_BUILD_STEP_KIND(AnnotateBoolAttr)
+    .Inputs({"block"})
+    .Attrs({"key", "value"})
+    .SetApplyFn(APPLY_FUNC_UNIFORM(Annotate<bool>));
+
+CINN_BUILD_STEP_KIND(AnnotateFloatAttr)
+    .Inputs({"block"})
+    .Attrs({"key", "value"})
+    .SetApplyFn(APPLY_FUNC_UNIFORM(Annotate<float>));
+
+CINN_BUILD_STEP_KIND(AnnotateStringAttr)
+    .Inputs({"block"})
+    .Attrs({"key", "value"})
+    .SetApplyFn(APPLY_FUNC_UNIFORM(AnnotateStringAttr));
+
+CINN_BUILD_STEP_KIND(Unannotate)
+    .Inputs({"block"})
+    .Attrs({"key"})
+    .SetApplyFn(APPLY_FUNC_UNIFORM(FREE_FUNCTION_CONVERTER(&IRSchedule::Unannotate)));
+
+CINN_BUILD_STEP_KIND(FlattenLoops)
+    .Inputs({"loops"})
+    .Attrs({"force_flat"})
+    .SetApplyFn(APPLY_FUNC_UNIFORM(FREE_FUNCTION_CONVERTER(&IRSchedule::FlattenLoops)));
+
+CINN_BUILD_STEP_KIND(SamplePerfectTile)
+    .Inputs({"loop"})
+    .Attrs({"n", "max_innermost_factor", "decision"})
+    .SetApplyFn(APPLY_FUNC_UNIFORM(FREE_FUNCTION_CONVERTER(&IRSchedule::SamplePerfectTile)));
+
+CINN_BUILD_STEP_KIND(TagPostSchedule)
+    .SetApplyFn(APPLY_FUNC_UNIFORM(FREE_FUNCTION_CONVERTER(&IRSchedule::TagPostSchedule)));
+
+CINN_BUILD_STEP_KIND(SampleCategorical)
+    .Attrs({"candidates", "probs", "decision"})
+    .SetApplyFn(APPLY_FUNC_UNIFORM(FREE_FUNCTION_CONVERTER(&IRSchedule::SampleCategorical)));
+// clang-format on
+
+// ------ The following code implements the member functions of the ScheduleDesc class
+void AttrVariantToProto(const utils::Attribute& attr, proto::ScheduleDesc_Attr* attr_proto) {
+#define SET_DESC_SINGLE_ITEM(index, built_type, proto_type, proto_field)   \
+  case index:                                                              \
+    attr_proto->set_dtype(proto::ScheduleDesc_Attr_DataType_##proto_type); \
+    attr_proto->set_##proto_field(absl::get<built_type>(attr));            \
+    break;
+
+#define SET_DESC_REPEATED_ITEM(index, built_type, proto_type, proto_field)  \
+  case index: {                                                             \
+    attr_proto->set_dtype(proto::ScheduleDesc_Attr_DataType_##proto_type);  \
+    const auto& values = absl::get<built_type>(attr);                       \
+    attr_proto->mutable_##proto_field()->Reserve(values.size());            \
+    *attr_proto->mutable_##proto_field() = {values.begin(), values.end()};  \
+    break;                                                                  \
+  }
+
+  switch (attr.index()) {
+    SET_DESC_SINGLE_ITEM(0, bool, BOOLEAN, b);
+    SET_DESC_SINGLE_ITEM(1, float, FLOAT, f);
+    SET_DESC_SINGLE_ITEM(2, int, INT, i);
+    SET_DESC_SINGLE_ITEM(3, std::string, STRING, s);
+    SET_DESC_REPEATED_ITEM(4, std::vector<bool>, BOOLEANS, bools);
+    SET_DESC_REPEATED_ITEM(5, std::vector<int>, INTS, ints);
+    SET_DESC_REPEATED_ITEM(6, std::vector<float>, FLOATS, floats);
+    SET_DESC_REPEATED_ITEM(7, std::vector<std::string>, STRINGS, strings);
+    SET_DESC_SINGLE_ITEM(8, int64_t, LONG, l);
+    SET_DESC_SINGLE_ITEM(9, double, DOUBLE, d);
+    SET_DESC_REPEATED_ITEM(10, std::vector<int64_t>, LONGS, longs);
+    SET_DESC_REPEATED_ITEM(11, std::vector<double>, DOUBLES, doubles);
+    default:
+      LOG(FATAL) << "Invalid index:" << attr.index();
+  }
+
+#undef SET_DESC_SINGLE_ITEM
+#undef SET_DESC_REPEATED_ITEM
+}
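+
+// For example, a utils::Attribute currently holding a std::vector<int> has
+// attr.index() == 5, so the switch above stores it in the repeated `ints`
+// field with dtype INTS; AttrProtoToVariant below performs the inverse
+// mapping when a serialized ScheduleDesc is loaded back.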
+
+utils::Attribute AttrProtoToVariant(const proto::ScheduleDesc_Attr& attr) {
+  utils::Attribute value;
+#define PARSE_DESC_SINGLE_ITEM(proto_type, proto_field, built_type)  \
+  case proto::ScheduleDesc_Attr_DataType_##proto_type:               \
+    value = built_type(attr.proto_field());                          \
+    break;
+
+#define PARSE_DESC_REPEATED_ITEM(proto_type, proto_field, built_type)            \
+  case proto::ScheduleDesc_Attr_DataType_##proto_type:                           \
+    value = built_type({attr.proto_field().begin(), attr.proto_field().end()});  \
+    break;
+
+  switch (attr.dtype()) {
+    PARSE_DESC_SINGLE_ITEM(BOOLEAN, b, bool);
+    PARSE_DESC_SINGLE_ITEM(INT, i, int);
+    PARSE_DESC_SINGLE_ITEM(FLOAT, f, float);
+    PARSE_DESC_SINGLE_ITEM(STRING, s, std::string);
+    PARSE_DESC_REPEATED_ITEM(BOOLEANS, bools, std::vector<bool>);
+    PARSE_DESC_REPEATED_ITEM(INTS, ints, std::vector<int>);
+    PARSE_DESC_REPEATED_ITEM(FLOATS, floats, std::vector<float>);
+    PARSE_DESC_REPEATED_ITEM(STRINGS, strings, std::vector<std::string>);
+    PARSE_DESC_SINGLE_ITEM(LONG, l, int64_t);
+    PARSE_DESC_SINGLE_ITEM(DOUBLE, d, double);
+    PARSE_DESC_REPEATED_ITEM(LONGS, longs, std::vector<int64_t>);
+    PARSE_DESC_REPEATED_ITEM(DOUBLES, doubles, std::vector<double>);
+    default:
+      LOG(FATAL) << "Invalid type:" << attr.DebugString();
+  }
+
+#undef PARSE_DESC_SINGLE_ITEM
+#undef PARSE_DESC_REPEATED_ITEM
+  return value;
+}
+
+// Expr hash functor, presents how to hash an Expr
+struct ExprHash {
+  size_t operator()(const Expr& e) const { return std::hash<void*>()(e.ptr()); }
+};
+// Expr equal functor, presents whether an Expr pair is equal
+struct ExprEqual {
+  bool operator()(const Expr& lhs, const Expr& rhs) const { return lhs.get() == rhs.get(); }
+};
+
+void ScheduleDesc::Append(Step&& step) { steps_.emplace_back(std::move(step)); }
+
+void ScheduleDesc::Pop() {
+  if (!steps_.empty()) {
+    steps_.pop_back();
+  }
+}
+
+void ScheduleDesc::Replay(IRSchedule* schedule, bool without_post_schedule) const {
+  ReplayWithProto(this->ToProto(), schedule, without_post_schedule);
+}
+
+proto::ScheduleDesc ScheduleDesc::ToProto() const {
+  // map each Expr to a formatted name (e1, e2, ...)
+  absl::flat_hash_map<Expr, std::string, ExprHash, ExprEqual> expr2name;
+  proto::ScheduleDesc desc_proto;
+
+  for (auto&& step : steps_) {
+    auto* step_proto = desc_proto.add_steps();
+    step_proto->set_type(step.type);
+    // inputs of a step must refer to Exprs produced by preceding steps
+    for (auto&& param2exprs : step.inputs) {
+      const std::string& param_name = param2exprs.first;
+      auto* expr_desc = step_proto->add_inputs();
+      expr_desc->set_parameter(param_name);
+      for (auto&& expr : param2exprs.second) {
+        auto expr_it = expr2name.find(expr);
+        CHECK(expr_it != expr2name.end()) << "Can't find expr of param_name: " << param_name;
+        expr_desc->add_arguments(expr_it->second);
+      }
+    }
+
+    // each output Expr is represented by a formatted name, to be referred to by succeeding steps
+    for (auto&& expr : step.outputs) {
+      std::string local_name = "e" + std::to_string(expr2name.size());
+      expr2name.emplace(expr, local_name);
+      step_proto->add_outputs(expr2name.at(expr));
+    }
+
+    for (auto&& attr2value : step.attrs) {
+      auto* attr_proto = step_proto->add_attrs();
+      const auto& attr_value = attr2value.second;
+      VLOG(5) << "Attr.index:" << attr_value.index();
+      attr_proto->set_name(attr2value.first);
+      AttrVariantToProto(attr_value, attr_proto);
+    }
+  }
+  return desc_proto;
+}
+
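+// Naming scheme example: if a GetLoops step produces three loop Exprs, they
+// are recorded as outputs "e0", "e1" and "e2"; a later Fuse step consuming
+// them serializes its input as {parameter: "loops", arguments: ["e0", "e1",
+// "e2"]}, which ReplayWithProto resolves back through name2expr below.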
+
+std::vector<Expr> ScheduleDesc::ReplayWithProto(const proto::ScheduleDesc& desc_proto,
+                                                IRSchedule* sch,
+                                                bool without_post_schedule) {
+  VLOG(4) << "proto::ScheduleDesc:\n" << desc_proto.DebugString();
+  if (desc_proto.steps().empty()) {
+    LOG(WARNING) << "Input proto::ScheduleDesc is empty";
+    return {};
+  }
+
+  // map a formatted name (e1, e2, ...) to an Expr
+  absl::flat_hash_map<std::string, Expr> name2expr;
+  std::vector<Expr> last_outputs;
+
+  // restore each scheduling step and apply it to the new IRSchedule object
+  for (auto&& step_proto : desc_proto.steps()) {
+    VLOG(4) << "Replay step:\n" << step_proto.DebugString();
+    ScheduleDesc::Step step;
+    step.type = step_proto.type();
+    CHECK(!step.type.empty()) << "Name of StepKind is empty";
+    if (without_post_schedule && step.type == "TagPostSchedule") {
+      break;
+    }
+    const StepKindInfo* step_kind = StepKindRegistry::Global()->Find(step.type);
+    CHECK(step_kind) << "Can't find StepKind:" << step.type;
+
+    for (auto&& param2args : step_proto.inputs()) {
+      for (auto&& arg : param2args.arguments()) {
+        auto arg_it = name2expr.find(arg);
+        CHECK(arg_it != name2expr.end()) << "Can't find argument:" << arg;
+        step.inputs[param2args.parameter()].emplace_back(arg_it->second);
+      }
+    }
+    for (auto&& attr : step_proto.attrs()) {
+      step.attrs[attr.name()] = AttrProtoToVariant(attr);
+    }
+
+    PackedStepContext context(step, step_kind, sch);
+    step.outputs = step_kind->Apply(&context);
+    CHECK_EQ(step_proto.outputs().size(), step.outputs.size()) << "Output size not matched";
+    for (size_t i = 0; i < step.outputs.size(); ++i) {
+      name2expr[step_proto.outputs(i)] = step.outputs.at(i);
+    }
+    last_outputs = std::move(step.outputs);
+  }
+  return last_outputs;
+}
+
+ScheduleDesc ScheduleDesc::ForkAndUpdate(int step_idx, utils::Attribute decision, bool without_post_schedule) const {
+  int n_valid_step = 0;
+  if (!without_post_schedule) {
+    n_valid_step = steps_.size();
+  } else {
+    for (const auto& step : steps_) {
+      if (step.type != "TagPostSchedule") {
+        ++n_valid_step;
+      } else {
+        break;
+      }
+    }
+  }
+  std::vector<Step> new_steps(steps_.begin(), steps_.begin() + n_valid_step);
+  new_steps[step_idx].attrs["decision"] = decision;
+  return ScheduleDesc(std::move(new_steps));
+}
+
+}  // namespace ir
+}  // namespace cinn
diff --git a/paddle/cinn/ir/schedule_desc.h b/paddle/cinn/ir/schedule_desc.h
new file mode 100644
index 0000000000000..43a1820cfe9e0
--- /dev/null
+++ b/paddle/cinn/ir/schedule_desc.h
@@ -0,0 +1,106 @@
+// Copyright (c) 2022 CINN Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include <absl/container/flat_hash_map.h>
+
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "cinn/ir/ir.h"
+#include "cinn/ir/schedule_desc.pb.h"
+#include "cinn/utils/registry.h"
+#include "cinn/utils/type_defs.h"
+
+namespace cinn {
+namespace ir {
+
+// A ScheduleDesc describes the scheduling process of an ir::ModuleExpr; it records
+// all transform/getting operations executed by a corresponding ir::IRSchedule.
+// A ScheduleDesc can be serialized to JSON format and saved to a file. For deserialization,
+// it can be re-applied to a new IRSchedule that is initialized with a semantics-equal
+// original ir::ModuleExpr, and then achieves the same result.
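+//
+// A minimal usage sketch (illustrative; `block` comes from a previous step and
+// `new_sch` is an IRSchedule built from a semantics-equal ModuleExpr):
+//
+//   ScheduleDesc desc;
+//   desc.Append(ScheduleDesc::Step(
+//       "GetBlock", {}, {{"block_name", std::string("B")}}, {block}));
+//   proto::ScheduleDesc p = desc.ToProto();         // serialize
+//   ScheduleDesc::ReplayWithProto(p, &new_sch);     // re-apply on a fresh schedule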
+
+class IRSchedule;  // forward declaration to avoid cross-reference
+class ScheduleDesc {
+ public:
+  // each operation executed through IRSchedule is recorded as a step
+  struct Step {
+    std::string type;  // step name
+    absl::flat_hash_map<std::string, std::vector<Expr>> inputs;
+    utils::AttributeMap attrs;
+    std::vector<Expr> outputs;
+    Step() = default;
+    Step(std::string type_i,
+         absl::flat_hash_map<std::string, std::vector<Expr>> inputs_i,
+         utils::AttributeMap attrs_i,
+         std::vector<Expr> outputs_i)
+        : type(type_i), inputs(inputs_i), attrs(attrs_i), outputs(outputs_i) {}
+  };
+
+  /**
+   * \brief Re-apply a scheduling process represented as a proto::ScheduleDesc to a new IRSchedule object.
+   * @param desc_proto The proto of the ScheduleDesc to be re-applied.
+   * @param sch The IRSchedule on which the description is replayed.
+   * @param without_post_schedule Determine whether to delete the post schedules.
+   */
+  static std::vector<Expr> ReplayWithProto(const proto::ScheduleDesc& desc_proto,
+                                           IRSchedule* sch,
+                                           bool without_post_schedule = false);
+
+  ScheduleDesc() = default;
+
+  ScheduleDesc(const std::vector<Step>& steps) : steps_(steps) {}
+
+  ScheduleDesc(std::vector<Step>&& steps) : steps_(steps) {}
+
+  // Append a new step
+  void Append(Step&& step);
+
+  // Pop the last step
+  void Pop();
+
+  /**
+   * \brief Replay this description on a new IRSchedule that is initialized with a semantics-equal original ModuleExpr.
+   * @param schedule The IRSchedule on which the description is replayed.
+   * @param without_post_schedule Determine whether to delete the post schedules.
+   */
+  void Replay(IRSchedule* schedule, bool without_post_schedule = false) const;
+
+  // convert to a proto::ScheduleDesc object
+  proto::ScheduleDesc ToProto() const;
+
+  // return the detail string of a ScheduleDesc for debugging
+  std::string DebugString() const { return ToProto().DebugString(); }
+
+  std::vector<Step> Steps() const { return steps_; }
+
+  bool Empty() const { return steps_.empty(); }
+
+  /**
+   * \brief Fork this ScheduleDesc and update a step of the new ScheduleDesc with a new decision.
+   * @param step_idx The index of the step to be updated.
+   * @param decision The new decision.
+   * @param without_post_schedule Determine whether to delete the post schedules.
+   * @return The new ScheduleDesc.
+   */
+  ScheduleDesc ForkAndUpdate(int step_idx, utils::Attribute decision, bool without_post_schedule) const;
+
+ private:
+  std::vector<Step> steps_;  // all operations are recorded in order.
+};
+
+}  // namespace ir
+}  // namespace cinn
diff --git a/paddle/cinn/ir/schedule_desc.proto b/paddle/cinn/ir/schedule_desc.proto
new file mode 100644
index 0000000000000..829478cf22dd4
--- /dev/null
+++ b/paddle/cinn/ir/schedule_desc.proto
@@ -0,0 +1,67 @@
+// Copyright (c) 2022 CINN Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
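+
+// Example of a single serialized step in text format (illustrative; the
+// argument names follow the "e<N>" convention used by ScheduleDesc::ToProto):
+//
+//   steps {
+//     type: "Fuse"
+//     inputs { parameter: "loops" arguments: "e0" arguments: "e1" }
+//     outputs: "e2"
+//   }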
+
+syntax = "proto3";
+
+package cinn.ir.proto;
+
+message ScheduleDesc {
+  message Expr {
+    string parameter = 1;
+    repeated string arguments = 2;
+  };
+
+  // Attribute type and value
+  message Attr {
+    enum DataType {
+      BOOLEAN = 0;
+      INT = 1;
+      FLOAT = 2;
+      STRING = 3;
+      BOOLEANS = 4;
+      INTS = 5;
+      FLOATS = 6;
+      STRINGS = 7;
+      LONG = 8;
+      DOUBLE = 9;
+      LONGS = 10;
+      DOUBLES = 11;
+    };
+
+    string name = 1;
+    DataType dtype = 2;
+    bool b = 3;
+    int32 i = 4;
+    float f = 5;
+    string s = 6;
+    repeated bool bools = 7;
+    repeated int32 ints = 8;
+    repeated float floats = 9;
+    repeated string strings = 10;
+    int64 l = 11;
+    double d = 12;
+    repeated int64 longs = 13;
+    repeated double doubles = 14;
+  };
+
+  message Step {
+    string type = 1;
+    repeated Expr inputs = 2;
+    repeated string outputs = 3;
+    repeated Attr attrs = 4;
+  };
+
+  // scheduling operation sequence
+  repeated Step steps = 1;
+};
diff --git a/paddle/cinn/ir/schedule_desc_test.cc b/paddle/cinn/ir/schedule_desc_test.cc
new file mode 100644
index 0000000000000..171f1fbedc3f8
--- /dev/null
+++ b/paddle/cinn/ir/schedule_desc_test.cc
@@ -0,0 +1,809 @@
+// Copyright (c) 2022 CINN Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "cinn/ir/schedule_desc.h"
+
+#include <gtest/gtest.h>
+#include
+
+#include "cinn/cinn.h"
+#include "cinn/common/context.h"
+#include "cinn/ir/ir_printer.h"
+#include "cinn/ir/ir_schedule.h"
+#include "cinn/lang/lower.h"
+#include "cinn/optim/ir_copy.h"
+#include "cinn/utils/string.h"
+#include "cinn/utils/type_defs.h"
+
+namespace cinn {
+namespace ir {
+
+// Return the lowered IR AST for the example functions used in this test
+std::vector<ir::LoweredFunc> LowerCompute(const std::vector<int>& shape,
+                                          const Target& target,
+                                          bool need_c = false,
+                                          const std::string& operation = "elementwise-copy") {
+  CHECK(shape.size() == 2 || shape.size() == 3) << "shape size should be 2 or 3";
+  std::vector<Expr> domain;
+  for (auto i = 0; i < shape.size(); ++i) {
+    domain.emplace_back(shape[i]);
+  }
+
+  Placeholder<float> A("A", domain);
+  ir::Tensor B, C;
+
+  if (operation == "elementwise-copy") {
+    if (domain.size() == 2) {
+      B = Compute(
+          domain, [&A](Var i, Var j) { return A(i, j); }, "B");
+      C = Compute(
+          domain, [&B](Var i, Var j) { return B(i, j); }, "C");
+    } else {
+      B = Compute(
+          domain, [&A](Var i, Var j, Var k) { return A(i, j, k); }, "B");
+      C = Compute(
+          domain, [&B](Var i, Var j, Var k) { return B(i, j, k); }, "C");
+    }
+  }
+
+  if (operation == "elementwise-add_const") {
+    if (domain.size() == 2) {
+      B = Compute(
+          domain, [&A](Var i, Var j) { return A(i, j) * Expr(2.f); }, "B");
+      C = Compute(
+          domain, [&B](Var i, Var j) { return B(i, j) + Expr(1.f); }, "C");
+    } else {
+      B = Compute(
+          domain, [&A](Var i, Var j, Var k) { return A(i, j, k) * Expr(2.f); }, "B");
+      C = Compute(
+          domain, [&B](Var i, Var j, Var k) { return B(i, j, k) + Expr(1.f); }, "C");
+    }
+  }
+
+  if (need_c) {
+    return cinn::lang::LowerVec("test_func", CreateStages({A, B, C}), {A, C}, {}, {}, nullptr, target, true);
+  }
+
+  return cinn::lang::LowerVec("test_func", CreateStages({A, B}), {A, B}, {}, {}, nullptr, target, true);
+}
+
+// Create a new IRSchedule with a copied ir::LoweredFunc AST
+IRSchedule MakeIRSchedule(const std::vector<ir::LoweredFunc>& lowered_funcs) {
+  std::vector<Expr> exprs;
+  for (auto&& func : lowered_funcs) {
+    exprs.emplace_back(optim::IRCopy(func->body));
+  }
+  return ir::IRSchedule(ir::ModuleExpr(exprs));
+}
+
+// Generate source code with the transformed ModuleExpr
+std::string SourceCodeGen(const ModuleExpr& module_expr,
+                          const std::vector<ir::LoweredFunc>& lowered_funcs,
+                          const Target& target) {
+  auto exprs = module_expr.GetExprs();
+  CHECK_EQ(exprs.size(), lowered_funcs.size()) << "size of funcs is not equal";
+  std::vector updated_funcs = optim::IRCopy(lowered_funcs);
+  Module::Builder builder("test_module", target);
+  for (auto i = 0; i < lowered_funcs.size(); ++i) {
+    updated_funcs[i]->body = optim::IRCopy(exprs.at(i));
+    builder.AddFunction(updated_funcs[i]);
+  }
+  auto module = builder.Build();
+  CodeGenC codegen(target);
+  codegen.SetInlineBuiltinCodes(false);
+  return codegen.Compile(module, CodeGenC::OutputKind::CImpl);
+}
+
+class TestScheduleDesc : public ::testing::Test {
+ public:
+  Target target = common::DefaultHostTarget();
+  std::vector<ir::LoweredFunc> lowered_funcs;
+  ScheduleDesc trace;
+  void SetUp() override { Context::Global().ResetNameId(); }
+
+  void CheckTracingOutputs(const std::vector<Expr>& base, const ScheduleDesc& trace_desc) {
+    Context::Global().ResetNameId();
+    ir::IRSchedule replay_sch = MakeIRSchedule(lowered_funcs);
+    auto traced_outputs = ScheduleDesc::ReplayWithProto(trace_desc.ToProto(), &replay_sch);
+    ASSERT_EQ(base.size(), traced_outputs.size());
+    for (auto i = 0; i < base.size(); ++i) {
+      ASSERT_EQ(utils::GetStreamCnt(base.at(i)), utils::GetStreamCnt(traced_outputs.at(i)));
+    }
+  }
+
+  void CheckReplayResult(const ir::IRSchedule& ir_sch, const ScheduleDesc& trace_desc) {
+    Context::Global().ResetNameId();
+    ir::IRSchedule replay_sch = MakeIRSchedule(lowered_funcs);
+    trace_desc.Replay(&replay_sch);
+
+    // check the equality of module exprs between the original schedule
+    // and the schedule generated by replaying the traced ScheduleDesc
+    auto lhs_exprs = ir_sch.GetModule().GetExprs();
+    auto rhs_exprs = replay_sch.GetModule().GetExprs();
+    ASSERT_EQ(lhs_exprs.size(), rhs_exprs.size());
+    for (auto i = 0; i < lhs_exprs.size(); ++i) {
+      ASSERT_EQ(utils::GetStreamCnt(lhs_exprs.at(i)), utils::GetStreamCnt(rhs_exprs.at(i)));
+    }
+
+    // check the equality of the source code generated from them
+    ASSERT_EQ(utils::Trim(SourceCodeGen(ir_sch.GetModule(), lowered_funcs, target)),
+              utils::Trim(SourceCodeGen(replay_sch.GetModule(), lowered_funcs, target)));
+  }
+};
+
+TEST_F(TestScheduleDesc, Append_Replay) {
+  lowered_funcs = LowerCompute({32, 32}, target);
+  ir::IRSchedule ir_sch = MakeIRSchedule(lowered_funcs);
+
+  auto fused = ir_sch.Fuse("B", {0, 1});
+  trace.Append(ScheduleDesc::Step(
+      "FuseWithName", {}, {{"block_name", std::string("B")}, {"loops_index", std::vector({0, 1})}}, {fused}));
+  auto sample = ir_sch.SamplePerfectTile(fused, 2, 1, {4, -1});
+  trace.Append(ScheduleDesc::Step("SamplePerfectTile",
+                                  {{"loop", std::vector({fused})}},
+                                  {{"n", 2}, {"max_innermost_factor", 1}, {"decision", std::vector{4, -1}}},
+                                  sample));
+  auto splited = ir_sch.Split(fused, sample);
+  trace.Append(ScheduleDesc::Step("Split", {{"loop", std::vector({fused})}, {"factors", sample}}, {}, splited));
+
+  auto loops = ir_sch.GetLoops("B");
+  trace.Append(ScheduleDesc::Step("GetLoopsWithName", {}, {{"block_name", std::string("B")}}, loops));
+ 
fused = ir_sch.Fuse(loops); + trace.Append(ScheduleDesc::Step("Fuse", {{"loops", loops}}, {}, {fused})); + sample = ir_sch.SamplePerfectTile(fused, 2, 1, {256, -1}); + trace.Append(ScheduleDesc::Step("SamplePerfectTile", + {{"loop", std::vector({fused})}}, + {{"n", 2}, {"max_innermost_factor", 1}, {"decision", std::vector{256, -1}}}, + sample)); + splited = ir_sch.Split(fused, sample); + trace.Append(ScheduleDesc::Step("Split", {{"loop", std::vector({fused})}, {"factors", sample}}, {}, splited)); + + // check the equality of results between the ir_sch and replaying of trace + CheckTracingOutputs(splited, trace); + CheckReplayResult(ir_sch, trace); + // check the equality of results between the ir_sch and replaying of its trace + CheckTracingOutputs(splited, ir_sch.GetTraceDesc()); + CheckReplayResult(ir_sch, ir_sch.GetTraceDesc()); +} + +// Test cases with `StepKind` prefix are to check the correctness of their StepKindInfo register +TEST_F(TestScheduleDesc, StepKind_GetAllBlocks) { + lowered_funcs = LowerCompute({32, 32}, target); + ir::IRSchedule ir_sch = MakeIRSchedule(lowered_funcs); + + auto all_blocks = ir_sch.GetAllBlocks(); + trace.Append(ScheduleDesc::Step("GetAllBlocks", {}, {}, {all_blocks})); + CheckTracingOutputs(all_blocks, trace); + CheckTracingOutputs(all_blocks, ir_sch.GetTraceDesc()); +} + +TEST_F(TestScheduleDesc, StepKind_GetChildBlocks) { + lowered_funcs = LowerCompute({32, 32, 64}, target, true); + ir::IRSchedule ir_sch = MakeIRSchedule(lowered_funcs); + + auto block_b = ir_sch.GetBlock("B"); + trace.Append(ScheduleDesc::Step("GetBlock", {}, {{"block_name", std::string("B")}}, {block_b})); + auto loops = ir_sch.GetLoops("C"); + trace.Append(ScheduleDesc::Step("GetLoopsWithName", {}, {{"block_name", std::string("C")}}, loops)); + ir_sch.ComputeAt(block_b, loops[1]); + trace.Append(ScheduleDesc::Step("ComputeAt", + {{"block", std::vector({block_b})}, {"loop", std::vector({loops[1]})}}, + {{"keep_unit_loops", false}}, + {})); + loops = ir_sch.GetLoops("B"); + trace.Append(ScheduleDesc::Step("GetLoopsWithName", {}, {{"block_name", std::string("B")}}, loops)); + auto root_block = ir_sch.GetRootBlock(loops[1]); + trace.Append(ScheduleDesc::Step("GetRootBlock", {{"expr", std::vector({loops[1]})}}, {}, {root_block})); + auto childblocks = ir_sch.GetChildBlocks(root_block); + trace.Append(ScheduleDesc::Step("GetChildBlocks", {{"expr", std::vector({root_block})}}, {}, childblocks)); + CheckTracingOutputs(childblocks, trace); + CheckTracingOutputs(childblocks, ir_sch.GetTraceDesc()); +} + +TEST_F(TestScheduleDesc, StepKind_GetLoops) { + lowered_funcs = LowerCompute({32, 32}, target); + ir::IRSchedule ir_sch = MakeIRSchedule(lowered_funcs); + + auto block_b = ir_sch.GetBlock("B"); + trace.Append(ScheduleDesc::Step("GetBlock", {}, {{"block_name", std::string("B")}}, {block_b})); + auto loops = ir_sch.GetLoops(block_b); + trace.Append(ScheduleDesc::Step("GetLoops", {{"block", std::vector({block_b})}}, {}, loops)); + CheckTracingOutputs(loops, trace); + CheckTracingOutputs(loops, ir_sch.GetTraceDesc()); +} + +TEST_F(TestScheduleDesc, StepKind_GetLoopsWithName) { + lowered_funcs = LowerCompute({32, 32}, target); + ir::IRSchedule ir_sch = MakeIRSchedule(lowered_funcs); + + auto loops = ir_sch.GetLoops("B"); + trace.Append(ScheduleDesc::Step("GetLoopsWithName", {}, {{"block_name", std::string("B")}}, loops)); + CheckTracingOutputs(loops, trace); + CheckTracingOutputs(loops, ir_sch.GetTraceDesc()); +} + +TEST_F(TestScheduleDesc, StepKind_GetBlock) { + lowered_funcs = LowerCompute({32, 
32, 32}, target);
+  ir::IRSchedule ir_sch = MakeIRSchedule(lowered_funcs);
+
+  auto block_b = ir_sch.GetBlock("B");
+  trace.Append(ScheduleDesc::Step("GetBlock", {}, {{"block_name", std::string("B")}}, {block_b}));
+  CheckTracingOutputs({block_b}, trace);
+  CheckTracingOutputs({block_b}, ir_sch.GetTraceDesc());
+}
+// TODO: Fix in the future; after the split variable renaming fix, this case has some problems.
+/*
+TEST_F(TestScheduleDesc, StepKind_Split) {
+  lowered_funcs = LowerCompute({32, 32, 32}, target);
+  ir::IRSchedule ir_sch_split_base = MakeIRSchedule(lowered_funcs);
+  ir::IRSchedule ir_sch_split = MakeIRSchedule(lowered_funcs);
+  ir::IRSchedule ir_sch_split_with_name = MakeIRSchedule(lowered_funcs);
+
+  // test split with inputs of Expr
+  auto loops = ir_sch_split_base.GetLoops("B");
+  trace.Append(ScheduleDesc::Step("GetLoopsWithName", {}, {{"block_name", std::string("B")}}, loops));
+  auto sample = ir_sch_split_base.SamplePerfectTile(loops.front(), 2, 1, {4, -1});
+  trace.Append(ScheduleDesc::Step("SamplePerfectTile",
+                                  {{"loop", std::vector({loops.front()})}},
+                                  {{"n", 2}, {"max_innermost_factor", 1}, {"decision", std::vector{4, -1}}},
+                                  sample));
+  auto splited = ir_sch_split_base.Split(loops.front(), sample);
+  trace.Append(
+      ScheduleDesc::Step("Split", {{"loop", std::vector({loops.front()})}, {"factors", sample}}, {}, splited));
+  CheckTracingOutputs(splited, trace);
+  CheckTracingOutputs(splited, ir_sch_split_base.GetTraceDesc());
+
+  // test split with inputs of int
+  loops = ir_sch_split.GetLoops("B");
+  splited = ir_sch_split.Split(loops.front(), {4, -1});
+  CheckTracingOutputs(splited, trace);
+  CheckTracingOutputs(splited, ir_sch_split.GetTraceDesc());
+
+  // test split with block name and inputs of int
+  splited = ir_sch_split_with_name.Split("B", 0, {4, -1});
+  CheckTracingOutputs(splited, trace);
+  CheckTracingOutputs(splited, ir_sch_split_with_name.GetTraceDesc());
+}
+*/
+TEST_F(TestScheduleDesc, StepKind_Fuse) {
+  lowered_funcs = LowerCompute({32, 32, 64}, target);
+  ir::IRSchedule ir_sch = MakeIRSchedule(lowered_funcs);
+
+  auto loops = ir_sch.GetLoops("B");
+  trace.Append(ScheduleDesc::Step("GetLoopsWithName", {}, {{"block_name", std::string("B")}}, loops));
+  auto fused = ir_sch.Fuse(loops);
+  trace.Append(ScheduleDesc::Step("Fuse", {{"loops", loops}}, {}, {fused}));
+  CheckTracingOutputs({fused}, trace);
+  CheckTracingOutputs({fused}, ir_sch.GetTraceDesc());
+}
+
+TEST_F(TestScheduleDesc, StepKind_FuseWithName) {
+  lowered_funcs = LowerCompute({32, 32, 64}, target);
+  ir::IRSchedule ir_sch = MakeIRSchedule(lowered_funcs);
+
+  auto fused = ir_sch.Fuse("B", {0, 1, 2});
+  trace.Append(ScheduleDesc::Step(
+      "FuseWithName", {}, {{"block_name", std::string("B")}, {"loops_index", std::vector({0, 1, 2})}}, {fused}));
+  CheckTracingOutputs({fused}, trace);
+  CheckTracingOutputs({fused}, ir_sch.GetTraceDesc());
+}
+
+TEST_F(TestScheduleDesc, StepKind_FuseWithBlock) {
+  lowered_funcs = LowerCompute({32, 32, 64}, target);
+  ir::IRSchedule ir_sch = MakeIRSchedule(lowered_funcs);
+
+  auto block_b = ir_sch.GetBlock("B");
+  trace.Append(ScheduleDesc::Step("GetBlock", {}, {{"block_name", std::string("B")}}, {block_b}));
+  auto fused = ir_sch.Fuse(block_b, {0, 1, 2});
+  trace.Append(ScheduleDesc::Step("FuseWithBlock",
+                                  {{"block", std::vector({block_b})}},
+                                  {{"loops_index", std::vector({0, 1, 2})}},
+                                  {fused}));
+  CheckTracingOutputs({fused}, trace);
+  CheckTracingOutputs({fused}, ir_sch.GetTraceDesc());
+}
+
+TEST_F(TestScheduleDesc, StepKind_ComputeAt) {
+  lowered_funcs = LowerCompute({32, 32, 
64}, target, true); + ir::IRSchedule ir_sch = MakeIRSchedule(lowered_funcs); + + auto block_b = ir_sch.GetBlock("B"); + trace.Append(ScheduleDesc::Step("GetBlock", {}, {{"block_name", std::string("B")}}, {block_b})); + auto loops = ir_sch.GetLoops("C"); + trace.Append(ScheduleDesc::Step("GetLoopsWithName", {}, {{"block_name", std::string("C")}}, loops)); + ir_sch.ComputeAt(block_b, loops[1]); + trace.Append(ScheduleDesc::Step("ComputeAt", + {{"block", std::vector({block_b})}, {"loop", std::vector({loops[1]})}}, + {{"keep_unit_loops", false}}, + {})); + CheckReplayResult(ir_sch, trace); + CheckReplayResult(ir_sch, ir_sch.GetTraceDesc()); +} + +TEST_F(TestScheduleDesc, StepKind_SimpleComputeAt) { + lowered_funcs = LowerCompute({32, 32, 64}, target, true); + ir::IRSchedule ir_sch = MakeIRSchedule(lowered_funcs); + + auto block_b = ir_sch.GetBlock("B"); + trace.Append(ScheduleDesc::Step("GetBlock", {}, {{"block_name", std::string("B")}}, {block_b})); + auto loops = ir_sch.GetLoops("C"); + trace.Append(ScheduleDesc::Step("GetLoopsWithName", {}, {{"block_name", std::string("C")}}, loops)); + ir_sch.SimpleComputeAt(block_b, loops[2]); + trace.Append(ScheduleDesc::Step("SimpleComputeAt", + {{"block", std::vector({block_b})}, {"loop", std::vector({loops[2]})}}, + {{"keep_unit_loops", false}}, + {})); + CheckReplayResult(ir_sch, trace); + CheckReplayResult(ir_sch, ir_sch.GetTraceDesc()); +} + +TEST_F(TestScheduleDesc, StepKind_ReverseComputeAt) { + lowered_funcs = LowerCompute({32, 32, 64}, target, true); + ir::IRSchedule ir_sch = MakeIRSchedule(lowered_funcs); + + auto block_c = ir_sch.GetBlock("C"); + trace.Append(ScheduleDesc::Step("GetBlock", {}, {{"block_name", std::string("C")}}, {block_c})); + auto loops = ir_sch.GetLoops("B"); + trace.Append(ScheduleDesc::Step("GetLoopsWithName", {}, {{"block_name", std::string("B")}}, loops)); + ir_sch.ReverseComputeAt(block_c, loops[1]); + trace.Append(ScheduleDesc::Step("ReverseComputeAt", + {{"block", std::vector({block_c})}, {"loop", std::vector({loops[1]})}}, + {{"keep_unit_loops", false}}, + {})); + CheckReplayResult(ir_sch, trace); + CheckReplayResult(ir_sch, ir_sch.GetTraceDesc()); +} + +TEST_F(TestScheduleDesc, StepKind_GetRootBlock) { + lowered_funcs = LowerCompute({32, 64}, target); + ir::IRSchedule ir_sch = MakeIRSchedule(lowered_funcs); + + auto loops = ir_sch.GetLoops("B"); + trace.Append(ScheduleDesc::Step("GetLoopsWithName", {}, {{"block_name", std::string("B")}}, loops)); + auto root_b = ir_sch.GetRootBlock(loops[1]); + trace.Append(ScheduleDesc::Step("GetRootBlock", {{"expr", std::vector({loops[1]})}}, {}, {root_b})); + CheckTracingOutputs({root_b}, trace); + CheckTracingOutputs({root_b}, ir_sch.GetTraceDesc()); +} + +TEST_F(TestScheduleDesc, StepKind_CacheRead) { + lowered_funcs = LowerCompute({32, 64}, target, false, "elementwise-add_const"); + ir::IRSchedule ir_sch = MakeIRSchedule(lowered_funcs); + + auto block_b = ir_sch.GetBlock("B"); + trace.Append(ScheduleDesc::Step("GetBlock", {}, {{"block_name", std::string("B")}}, {block_b})); + auto a_cache = ir_sch.CacheRead(block_b, 0, "local"); + trace.Append(ScheduleDesc::Step("CacheRead", + {{"block", std::vector({block_b})}}, + {{"read_buffer_index", 0}, {"memory_type", std::string("local")}}, + {a_cache})); + CheckTracingOutputs({a_cache}, trace); + CheckTracingOutputs({a_cache}, ir_sch.GetTraceDesc()); + CheckReplayResult(ir_sch, trace); + CheckReplayResult(ir_sch, ir_sch.GetTraceDesc()); +} + +TEST_F(TestScheduleDesc, StepKind_CacheWrite) { + lowered_funcs = LowerCompute({32, 64}, 
target, false, "elementwise-add_const"); + ir::IRSchedule ir_sch = MakeIRSchedule(lowered_funcs); + + auto block_b = ir_sch.GetBlock("B"); + trace.Append(ScheduleDesc::Step("GetBlock", {}, {{"block_name", std::string("B")}}, {block_b})); + auto b_cache = ir_sch.CacheWrite(block_b, 0, "local"); + trace.Append(ScheduleDesc::Step("CacheWrite", + {{"block", std::vector({block_b})}}, + {{"write_buffer_index", 0}, {"memory_type", std::string("local")}}, + {b_cache})); + CheckTracingOutputs({b_cache}, trace); + CheckTracingOutputs({b_cache}, ir_sch.GetTraceDesc()); + CheckReplayResult(ir_sch, trace); + CheckReplayResult(ir_sch, ir_sch.GetTraceDesc()); +} + +TEST_F(TestScheduleDesc, StepKind_SyncThreads) { + lowered_funcs = LowerCompute({64, 32}, target, true, "elementwise-add_const"); + ir::IRSchedule ir_sch = MakeIRSchedule(lowered_funcs); + + auto block_b = ir_sch.GetBlock("B"); + trace.Append(ScheduleDesc::Step("GetBlock", {}, {{"block_name", std::string("B")}}, {block_b})); + auto b_cache = ir_sch.CacheWrite(block_b, 0, "local"); + trace.Append(ScheduleDesc::Step("CacheWrite", + {{"block", std::vector({block_b})}}, + {{"write_buffer_index", 0}, {"memory_type", std::string("local")}}, + {b_cache})); + auto block_c = ir_sch.GetBlock("C"); + trace.Append(ScheduleDesc::Step("GetBlock", {}, {{"block_name", std::string("C")}}, {block_c})); + auto c_cache = ir_sch.CacheWrite(block_c, 0, "local"); + trace.Append(ScheduleDesc::Step("CacheWrite", + {{"block", std::vector({block_c})}}, + {{"write_buffer_index", 0}, {"memory_type", std::string("local")}}, + {c_cache})); + block_c = ir_sch.GetBlock("C"); + trace.Append(ScheduleDesc::Step("GetBlock", {}, {{"block_name", std::string("C")}}, {block_c})); + ir_sch.SyncThreads(block_c, false); + trace.Append( + ScheduleDesc::Step("SyncThreads", {{"ir_node", std::vector({block_c})}}, {{"after_node", false}}, {})); + block_b = ir_sch.GetBlock("B"); + trace.Append(ScheduleDesc::Step("GetBlock", {}, {{"block_name", std::string("B")}}, {block_b})); + ir_sch.SyncThreads(block_b); + trace.Append( + ScheduleDesc::Step("SyncThreads", {{"ir_node", std::vector({block_b})}}, {{"after_node", true}}, {})); + + CheckReplayResult(ir_sch, trace); + CheckReplayResult(ir_sch, ir_sch.GetTraceDesc()); +} + +TEST_F(TestScheduleDesc, StepKind_SetBuffer) { + lowered_funcs = LowerCompute({32, 64}, target, false, "elementwise-add_const"); + ir::IRSchedule ir_sch = MakeIRSchedule(lowered_funcs); + + auto block_b = ir_sch.GetBlock("B"); + trace.Append(ScheduleDesc::Step("GetBlock", {}, {{"block_name", std::string("B")}}, {block_b})); + ir_sch.SetBuffer(block_b, "shared", true); + trace.Append(ScheduleDesc::Step("SetBuffer", + {{"block", std::vector({block_b})}}, + {{"memory_type", std::string("shared")}, {"fixed", true}}, + {})); + CheckReplayResult(ir_sch, trace); + CheckReplayResult(ir_sch, ir_sch.GetTraceDesc()); +} + +TEST_F(TestScheduleDesc, StepKind_Reorder) { + lowered_funcs = LowerCompute({32, 64, 12}, target); + ir::IRSchedule ir_sch = MakeIRSchedule(lowered_funcs); + + auto loops = ir_sch.GetLoops("B"); + trace.Append(ScheduleDesc::Step("GetLoopsWithName", {}, {{"block_name", std::string("B")}}, loops)); + auto sample = ir_sch.SamplePerfectTile(loops[0], 2, 1, {-1, 4}); + trace.Append(ScheduleDesc::Step("SamplePerfectTile", + {{"loop", std::vector({loops[0]})}}, + {{"n", 2}, {"max_innermost_factor", 1}, {"decision", std::vector{-1, 4}}}, + sample)); + auto splited = ir_sch.Split(loops[0], sample); + trace.Append( + ScheduleDesc::Step("Split", {{"loop", std::vector({loops[0]})}, 
{"factors", sample}}, {}, splited)); + + loops = ir_sch.GetLoops("B"); + trace.Append(ScheduleDesc::Step("GetLoopsWithName", {}, {{"block_name", std::string("B")}}, loops)); + sample = ir_sch.SamplePerfectTile(loops[2], 2, 1, {-1, 2}); + trace.Append(ScheduleDesc::Step("SamplePerfectTile", + {{"loop", std::vector({loops[2]})}}, + {{"n", 2}, {"max_innermost_factor", 1}, {"decision", std::vector{-1, 2}}}, + sample)); + splited = ir_sch.Split(loops[2], sample); + trace.Append( + ScheduleDesc::Step("Split", {{"loop", std::vector({loops[2]})}, {"factors", sample}}, {}, splited)); + + loops = ir_sch.GetLoops("B"); + trace.Append(ScheduleDesc::Step("GetLoopsWithName", {}, {{"block_name", std::string("B")}}, loops)); + Expr ret = ir_sch.Reorder({loops[4], loops[0]}); + trace.Append(ScheduleDesc::Step("Reorder", {{"loops", std::vector({loops[4], loops[0]})}}, {}, {ret})); + CheckReplayResult(ir_sch, trace); + CheckReplayResult(ir_sch, ir_sch.GetTraceDesc()); +} + +TEST_F(TestScheduleDesc, StepKind_ReorderWithBlock) { + lowered_funcs = LowerCompute({32, 32, 64}, target); + ir::IRSchedule ir_sch = MakeIRSchedule(lowered_funcs); + auto loops = ir_sch.GetLoops("B"); + trace.Append(ScheduleDesc::Step("GetLoopsWithName", {}, {{"block_name", std::string("B")}}, loops)); + auto sample = ir_sch.SamplePerfectTile(loops[0], 2, 1, {-1, 4}); + trace.Append(ScheduleDesc::Step("SamplePerfectTile", + {{"loop", std::vector({loops[0]})}}, + {{"n", 2}, {"max_innermost_factor", 1}, {"decision", std::vector{-1, 4}}}, + sample)); + auto splited = ir_sch.Split(loops[0], sample); + trace.Append( + ScheduleDesc::Step("Split", {{"loop", std::vector({loops[0]})}, {"factors", sample}}, {}, splited)); + + loops = ir_sch.GetLoops("B"); + trace.Append(ScheduleDesc::Step("GetLoopsWithName", {}, {{"block_name", std::string("B")}}, loops)); + sample = ir_sch.SamplePerfectTile(loops[2], 2, 1, {-1, 2}); + trace.Append(ScheduleDesc::Step("SamplePerfectTile", + {{"loop", std::vector({loops[2]})}}, + {{"n", 2}, {"max_innermost_factor", 1}, {"decision", std::vector{-1, 2}}}, + sample)); + splited = ir_sch.Split(loops[2], sample); + trace.Append( + ScheduleDesc::Step("Split", {{"loop", std::vector({loops[2]})}, {"factors", sample}}, {}, splited)); + + auto block_b = ir_sch.GetBlock("B"); + trace.Append(ScheduleDesc::Step("GetBlock", {}, {{"block_name", std::string("B")}}, {block_b})); + Expr ret = ir_sch.Reorder("B", {2, 3, 1, 4, 0}); + trace.Append(ScheduleDesc::Step("ReorderWithBlock", + {{"block", std::vector({block_b})}}, + {{"loops_index", std::vector({2, 3, 1, 4, 0})}}, + {ret})); + CheckReplayResult(ir_sch, trace); + CheckReplayResult(ir_sch, ir_sch.GetTraceDesc()); +} + +TEST_F(TestScheduleDesc, StepKind_ReorderWithName) { + lowered_funcs = LowerCompute({32, 32, 64}, target); + ir::IRSchedule ir_sch = MakeIRSchedule(lowered_funcs); + + auto loops = ir_sch.GetLoops("B"); + trace.Append(ScheduleDesc::Step("GetLoopsWithName", {}, {{"block_name", std::string("B")}}, loops)); + auto sample = ir_sch.SamplePerfectTile(loops[0], 2, 1, {-1, 4}); + trace.Append(ScheduleDesc::Step("SamplePerfectTile", + {{"loop", std::vector({loops[0]})}}, + {{"n", 2}, {"max_innermost_factor", 1}, {"decision", std::vector{-1, 4}}}, + sample)); + auto splited = ir_sch.Split(loops[0], sample); + trace.Append( + ScheduleDesc::Step("Split", {{"loop", std::vector({loops[0]})}, {"factors", sample}}, {}, splited)); + + loops = ir_sch.GetLoops("B"); + trace.Append(ScheduleDesc::Step("GetLoopsWithName", {}, {{"block_name", std::string("B")}}, loops)); + sample = 
ir_sch.SamplePerfectTile(loops[2], 2, 1, {-1, 2}); + trace.Append(ScheduleDesc::Step("SamplePerfectTile", + {{"loop", std::vector({loops[2]})}}, + {{"n", 2}, {"max_innermost_factor", 1}, {"decision", std::vector{-1, 2}}}, + sample)); + splited = ir_sch.Split(loops[2], sample); + trace.Append( + ScheduleDesc::Step("Split", {{"loop", std::vector({loops[2]})}, {"factors", sample}}, {}, splited)); + + Expr ret = ir_sch.Reorder("B", {4, 2, 3, 1, 0}); + trace.Append( + ScheduleDesc::Step("ReorderWithName", + {}, + {{"block_name", std::string("B")}, {"loops_index", std::vector({4, 2, 3, 1, 0})}}, + {ret})); + CheckReplayResult(ir_sch, trace); + CheckReplayResult(ir_sch, ir_sch.GetTraceDesc()); +} + +TEST_F(TestScheduleDesc, StepKind_Parallel) { + lowered_funcs = LowerCompute({32, 64}, target); + ir::IRSchedule ir_sch = MakeIRSchedule(lowered_funcs); + + auto loops = ir_sch.GetLoops("B"); + trace.Append(ScheduleDesc::Step("GetLoopsWithName", {}, {{"block_name", std::string("B")}}, loops)); + ir_sch.Parallel(loops[0]); + trace.Append(ScheduleDesc::Step("Parallel", {{"loop", std::vector({loops[0]})}}, {}, {})); + CheckReplayResult(ir_sch, trace); + CheckReplayResult(ir_sch, ir_sch.GetTraceDesc()); +} + +TEST_F(TestScheduleDesc, StepKind_Vectorize) { + lowered_funcs = LowerCompute({32, 64}, target); + ir::IRSchedule ir_sch = MakeIRSchedule(lowered_funcs); + + auto loops = ir_sch.GetLoops("B"); + trace.Append(ScheduleDesc::Step("GetLoopsWithName", {}, {{"block_name", std::string("B")}}, loops)); + ir_sch.Vectorize(loops[1], 16); + trace.Append(ScheduleDesc::Step("Vectorize", {{"loop", std::vector({loops[1]})}}, {{"factor", 16}}, {})); + CheckReplayResult(ir_sch, trace); + CheckReplayResult(ir_sch, ir_sch.GetTraceDesc()); +} + +TEST_F(TestScheduleDesc, StepKind_Unroll) { + lowered_funcs = LowerCompute({32, 2}, target); + ir::IRSchedule ir_sch = MakeIRSchedule(lowered_funcs); + + auto loops = ir_sch.GetLoops("B"); + trace.Append(ScheduleDesc::Step("GetLoopsWithName", {}, {{"block_name", std::string("B")}}, loops)); + ir_sch.Unroll(loops[1]); + trace.Append(ScheduleDesc::Step("Unroll", {{"loop", std::vector({loops[1]})}}, {}, {})); + CheckReplayResult(ir_sch, trace); + CheckReplayResult(ir_sch, ir_sch.GetTraceDesc()); +} + +TEST_F(TestScheduleDesc, StepKind_ComputeInline) { + lowered_funcs = LowerCompute({32, 32, 32}, target, true, "elementwise-add_const"); + ir::IRSchedule ir_sch = MakeIRSchedule(lowered_funcs); + + auto block_b = ir_sch.GetBlock("B"); + trace.Append(ScheduleDesc::Step("GetBlock", {}, {{"block_name", std::string("B")}}, {block_b})); + ir_sch.ComputeInline(block_b); + trace.Append(ScheduleDesc::Step("ComputeInline", {{"schedule_block", std::vector({block_b})}}, {}, {})); + CheckReplayResult(ir_sch, trace); + CheckReplayResult(ir_sch, ir_sch.GetTraceDesc()); +} + +TEST_F(TestScheduleDesc, StepKind_ReverseComputeInline) { + lowered_funcs = LowerCompute({32, 32, 32}, target, true, "elementwise-add_const"); + ir::IRSchedule ir_sch = MakeIRSchedule(lowered_funcs); + auto block_c = ir_sch.GetBlock("C"); + trace.Append(ScheduleDesc::Step("GetBlock", {}, {{"block_name", std::string("C")}}, {block_c})); + ir_sch.ReverseComputeInline(block_c); + trace.Append(ScheduleDesc::Step("ReverseComputeInline", {{"schedule_block", std::vector({block_c})}}, {}, {})); + CheckReplayResult(ir_sch, trace); + CheckReplayResult(ir_sch, ir_sch.GetTraceDesc()); +} + +TEST_F(TestScheduleDesc, StepKind_Bind) { + lowered_funcs = LowerCompute({32, 128}, target); + ir::IRSchedule ir_sch = MakeIRSchedule(lowered_funcs); + 
+ auto loops = ir_sch.GetLoops("B"); + trace.Append(ScheduleDesc::Step("GetLoopsWithName", {}, {{"block_name", std::string("B")}}, loops)); + ir_sch.Bind(loops[0], "blockIdx.x"); + trace.Append(ScheduleDesc::Step( + "Bind", {{"loop", std::vector({loops[0]})}}, {{"thread_axis", std::string("blockIdx.x")}}, {})); + CheckReplayResult(ir_sch, trace); + CheckReplayResult(ir_sch, ir_sch.GetTraceDesc()); +} + +TEST_F(TestScheduleDesc, StepKind_Rfactor) { + Expr M(32); + Expr N(2); + Expr K(16); + + Placeholder A("A", {M, K}); + Placeholder B("B", {K, N}); + Var k(16, "k0"); + auto C = Compute( + {M, N}, [&](Var i, Var j) { return lang::ReduceSum(A(i, k) * B(k, j), {k}); }, "C"); + + lowered_funcs = + cinn::lang::LowerVec("test_rfactor", CreateStages({A, B, C}), {A, B, C}, {}, {}, nullptr, target, true); + + cinn::common::Context::Global().ResetNameId(); + ir::IRSchedule ir_sch = MakeIRSchedule(lowered_funcs); + cinn::common::Context::Global().ResetNameId(); + + auto loops = ir_sch.GetLoops("C"); + trace.Append(ScheduleDesc::Step("GetLoopsWithName", {}, {{"block_name", std::string("C")}}, loops)); + auto new_rf_tensor = ir_sch.Rfactor(loops[2], 0); + trace.Append( + ScheduleDesc::Step("Rfactor", {{"rf_loop", std::vector({loops[2]})}}, {{"rf_axis", 0}}, {new_rf_tensor})); + CheckTracingOutputs({new_rf_tensor}, trace); + CheckTracingOutputs({new_rf_tensor}, ir_sch.GetTraceDesc()); + CheckReplayResult(ir_sch, trace); + CheckReplayResult(ir_sch, ir_sch.GetTraceDesc()); +} + +TEST_F(TestScheduleDesc, StepKind_MergeExprs) { + auto funcs_0 = LowerCompute({32, 128}, target); + auto funcs_1 = LowerCompute({32, 32, 32}, target, true, "elementwise-add_const"); + + ir::IRSchedule ir_sch = + ir::IRSchedule(ir::ModuleExpr({optim::IRCopy(funcs_0[0]->body), optim::IRCopy(funcs_0[0]->body)})); + ir_sch.MergeExprs(); + trace.Append(ScheduleDesc::Step("MergeExprs", {}, {}, {})); + ir::IRSchedule replay_sch = + ir::IRSchedule(ir::ModuleExpr({optim::IRCopy(funcs_0[0]->body), optim::IRCopy(funcs_0[0]->body)})); + trace.Replay(&replay_sch); + + auto lhs_exprs = ir_sch.GetModule().GetExprs(); + auto rhs_exprs = replay_sch.GetModule().GetExprs(); + ASSERT_EQ(lhs_exprs.size(), rhs_exprs.size()); + for (auto i = 0; i < lhs_exprs.size(); ++i) { + ASSERT_EQ(utils::GetStreamCnt(lhs_exprs.at(i)), utils::GetStreamCnt(rhs_exprs.at(i))); + } +} + +TEST_F(TestScheduleDesc, StepKind_Annotate) { + lowered_funcs = LowerCompute({32, 128}, target); + ir::IRSchedule ir_sch = MakeIRSchedule(lowered_funcs); + + auto block_b = ir_sch.GetBlock("B"); + trace.Append(ScheduleDesc::Step("GetBlock", {}, {{"block_name", std::string("B")}}, {block_b})); + ir_sch.Annotate(block_b, "k1", int(64)); + trace.Append(ScheduleDesc::Step("AnnotateIntAttr", + {{"block", std::vector({block_b})}}, + {{"key", std::string("k1")}, {"value", int(64)}}, + {})); + + block_b = ir_sch.GetBlock("B"); + trace.Append(ScheduleDesc::Step("GetBlock", {}, {{"block_name", std::string("B")}}, {block_b})); + ir_sch.Annotate(block_b, "k2", bool(true)); + trace.Append(ScheduleDesc::Step("AnnotateBoolAttr", + {{"block", std::vector({block_b})}}, + {{"key", std::string("k2")}, {"value", bool(true)}}, + {})); + + block_b = ir_sch.GetBlock("B"); + trace.Append(ScheduleDesc::Step("GetBlock", {}, {{"block_name", std::string("B")}}, {block_b})); + ir_sch.Annotate(block_b, "k3", float(2.0)); + trace.Append(ScheduleDesc::Step("AnnotateFloatAttr", + {{"block", std::vector({block_b})}}, + {{"key", std::string("k3")}, {"value", float(2.0)}}, + {})); + + block_b = ir_sch.GetBlock("B"); + 
trace.Append(ScheduleDesc::Step("GetBlock", {}, {{"block_name", std::string("B")}}, {block_b})); + ir_sch.Annotate(block_b, "k4", std::string("v4")); + trace.Append(ScheduleDesc::Step("AnnotateStringAttr", + {{"block", std::vector({block_b})}}, + {{"key", std::string("k4")}, {"value", std::string("v4")}}, + {})); + + CheckReplayResult(ir_sch, trace); + CheckReplayResult(ir_sch, ir_sch.GetTraceDesc()); +} + +TEST_F(TestScheduleDesc, StepKind_Unannotate) { + lowered_funcs = LowerCompute({32, 128}, target); + ir::IRSchedule ir_sch = MakeIRSchedule(lowered_funcs); + + auto block_b = ir_sch.GetBlock("B"); + trace.Append(ScheduleDesc::Step("GetBlock", {}, {{"block_name", std::string("B")}}, {block_b})); + ir_sch.Annotate(block_b, "k1", int(64)); + trace.Append(ScheduleDesc::Step("AnnotateIntAttr", + {{"block", std::vector({block_b})}}, + {{"key", std::string("k1")}, {"value", int(64)}}, + {})); + + block_b = ir_sch.GetBlock("B"); + trace.Append(ScheduleDesc::Step("GetBlock", {}, {{"block_name", std::string("B")}}, {block_b})); + ir_sch.Annotate(block_b, "k2", bool(true)); + trace.Append(ScheduleDesc::Step("AnnotateBoolAttr", + {{"block", std::vector({block_b})}}, + {{"key", std::string("k2")}, {"value", bool(true)}}, + {})); + + block_b = ir_sch.GetBlock("B"); + trace.Append(ScheduleDesc::Step("GetBlock", {}, {{"block_name", std::string("B")}}, {block_b})); + ir_sch.Unannotate(block_b, "k1"); + trace.Append( + ScheduleDesc::Step("Unannotate", {{"block", std::vector({block_b})}}, {{"key", std::string("k1")}}, {})); + + block_b = ir_sch.GetBlock("B"); + trace.Append(ScheduleDesc::Step("GetBlock", {}, {{"block_name", std::string("B")}}, {block_b})); + ir_sch.Unannotate(block_b, "k2"); + trace.Append( + ScheduleDesc::Step("Unannotate", {{"block", std::vector({block_b})}}, {{"key", std::string("k2")}}, {})); + + CheckReplayResult(ir_sch, trace); + CheckReplayResult(ir_sch, ir_sch.GetTraceDesc()); +} + +TEST_F(TestScheduleDesc, StepKind_SamplePerfectTile) { + Expr M(1024); + Var n(1, "n"); + + Placeholder A("A", {M}); + auto B = Compute( + {M}, [&](Expr i) { return A(i) + n; }, "B"); + lowered_funcs = + cinn::lang::LowerVec("test_sample_perfect_tile", CreateStages({A, B}), {A, B}, {}, {}, nullptr, target, true); + + ir::IRSchedule ir_sch = MakeIRSchedule(lowered_funcs); + auto loops = ir_sch.GetLoops("B"); + trace.Append(ScheduleDesc::Step("GetLoopsWithName", {}, {{"block_name", std::string("B")}}, loops)); + auto result = ir_sch.SamplePerfectTile(loops[0], 2, 64); + std::vector decision; + std::transform(result.begin(), result.end(), std::back_inserter(decision), [](Expr x) { return x.as_int32(); }); + trace.Append(ScheduleDesc::Step("SamplePerfectTile", + {{"loop", std::vector({loops[0]})}}, + {{"n", 2}, {"max_innermost_factor", 64}, {"decision", decision}}, + result)); + CheckTracingOutputs(result, trace); + CheckTracingOutputs(result, ir_sch.GetTraceDesc()); + CheckReplayResult(ir_sch, trace); + CheckReplayResult(ir_sch, ir_sch.GetTraceDesc()); +} + +TEST_F(TestScheduleDesc, StepKind_SampleCategorical) { + lowered_funcs = LowerCompute({32, 32, 64}, target, true); + ir::IRSchedule ir_sch = MakeIRSchedule(lowered_funcs); + Expr ret = ir_sch.SampleCategorical({1, 2, 3}, {1.0, 2.0, 3.0}); + std::vector decision = {ret.as_int32()}; + trace.Append(ScheduleDesc::Step("SampleCategorical", + {}, + {{"candidates", std::vector({1, 2, 3})}, + {"probs", std::vector({1.0, 2.0, 3.0})}, + {"decision", decision}}, + {ret})); + CheckTracingOutputs({ret}, trace); + CheckTracingOutputs({ret}, ir_sch.GetTraceDesc()); 
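+  // The sampled result is recorded in the step's "decision" attribute, so
+  // replaying the trace is deterministic even though SampleCategorical
+  // itself draws randomly.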
+ CheckReplayResult(ir_sch, trace); + CheckReplayResult(ir_sch, ir_sch.GetTraceDesc()); +} + +} // namespace ir +} // namespace cinn diff --git a/paddle/cinn/ir/tensor.cc b/paddle/cinn/ir/tensor.cc new file mode 100755 index 0000000000000..f0e53231fd33e --- /dev/null +++ b/paddle/cinn/ir/tensor.cc @@ -0,0 +1,590 @@ +// Copyright (c) 2021 CINN Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "cinn/ir/tensor.h" + +#include + +#include "cinn/cinn.h" +#include "cinn/common/arithmatic.h" +#include "cinn/common/axis.h" +#include "cinn/common/cas.h" +#include "cinn/common/common.h" +#include "cinn/common/ir_util.h" +#include "cinn/ir/buffer.h" +#include "cinn/ir/ir_operators.h" +#include "cinn/ir/ir_printer.h" +#include "cinn/ir/ir_visitor.h" +#include "cinn/ir/operation.h" +#include "cinn/lang/compute.h" +#include "cinn/poly/isl_utils.h" +#include "cinn/poly/stage.h" + +namespace cinn { +namespace ir { + +Tensor _Tensor_::Make(const std::string &name, + Type dtype, + const std::vector &shape, + const std::vector &domain, + FunctionRef fn, + const std::vector &reduce_axis) { + CHECK(!name.empty()) << "Tensor name is set empty"; + auto n = make_shared<_Tensor_>(); + n->name = name; + n->shape = shape; + n->domain = domain; + n->reduce_axis = reduce_axis; + n->set_type(dtype); + n->operation = fn; + n->InitAxis(); + + return Tensor(n); +} + +size_t Tensor::ndims() const { return operator->()->shape.size(); } + +std::set _Tensor_::GetDependTensorNames() const { + std::set names; + + auto add_depend_tensors_from_expr = [&](Expr expr) { + auto tensors = + CollectIRNodes(expr, [&](const Expr *x) { return x->as_tensor() && x->as_tensor()->name != this->name; }); + for (auto &e : tensors) { + names.insert(e.as_tensor()->name); + } + }; + + if (is_compute_node()) { + add_depend_tensors_from_expr(body()); + } else if (is_call_node()) { + add_depend_tensors_from_expr(body()); + } else if (is_extern_call_node()) { + add_depend_tensors_from_expr(body()); + } else if (is_placeholder_node()) { + return names; + } else { + CINN_NOT_IMPLEMENTED + } + + return names; +} + +Expr Tensor::operator()(const std::vector &indices) const { + CHECK(!self()->is_tuple()) << "should extract a specific value from the tuple and operate on that instead"; + auto *node = operator->(); + + CHECK_EQ(indices.size(), ndims()) << "number of indices not match the dimension"; + + return Load::Make(*this, indices); +} + +Expr _Tensor_::inline_expanded(const std::vector &indices) { + CHECK(is_compute_node()); + return get_compute_op()->producer_fn(indices); +} + +const char *_Tensor_::operation_type() const { + if (!operation.defined()) return ""; + return operation->as()->func_type(); +} + +bool _Tensor_::is_compute_node() const { return std::strcmp(operation_type(), ir::ComputeOp::__func_type__) == 0; } +bool _Tensor_::is_placeholder_node() const { + return std::strcmp(operation_type(), ir::PlaceholderOp::__func_type__) == 0; +} +bool _Tensor_::is_call_node() const { return 
std::strcmp(operation_type(), ir::CallOp::__func_type__) == 0; } +bool _Tensor_::is_extern_call_node() const { + if (std::strcmp(operation_type(), ir::CallOp::__func_type__) == 0) { + auto *op = operation->as(); + auto *call = op->call_expr.As(); + if (call) { + return call->is_extern_call(); + } + } + return false; +} +bool _Tensor_::is_buffer_shared_node() const { + return std::strcmp(operation_type(), ir::BufferShareOp::__func_type__) == 0; +} + +bool _Tensor_::is_preceding_view_node() const { + return std::strcmp(operation_type(), ir::PrecedingViewOp::__func_type__) == 0; +} + +ComputeOp *_Tensor_::get_compute_op() const { + if (!is_compute_node()) return nullptr; + return operation->as(); +} + +PlaceholderOp *_Tensor_::get_placeholder_op() const { + if (!is_placeholder_node()) return nullptr; + return operation->as(); +} + +void _Tensor_::InitAxis() const { + // CHECK(!domain_without_reduce_axis().empty()); + axis_ = common::GenDefaultAxis(domain_without_reduce_axis().size()); +} + +bool _Tensor_::has_expression() const { + return (!is_placeholder_node()) && (!is_tuple_get()) && (!is_buffer_shared_node()); +} + +isl::set _Tensor_::GenerateIslDomain() const { + // include the reduce axis. + std::vector dims; + + if (has_expression()) { + if (axis_.empty()) InitAxis(); + auto domain = domain_with_reduce_axis(); + CHECK_EQ(axis_with_reduce().size(), domain.size()); + auto _axis_with_reduce = axis_with_reduce(); + for (int i = 0; i < domain.size(); i++) { + auto dim = domain[i]; + if (dim.is_constant()) { + dims.emplace_back(_axis_with_reduce[i]->name, 0, dim.as_int32() - 1); + } else { + dims.emplace_back(_axis_with_reduce[i]->name, Expr(0), Sub::Make(dim, common::make_const(1))); + } + } + } + + poly::Domain isl_domain(Context::isl_ctx(), name, dims); + VLOG(1) << "name:" << this->name << ", domain: " << isl_domain.__str__(); + return isl_domain.to_isl(); +} + +std::vector _Tensor_::expr_fields() { + std::vector res; + const char *func_type = operation->as()->func_type(); + if (operation.defined()) { + if (is_compute_node()) { + auto *op = operation->as(); + for (auto &expr : op->body) res.push_back(&expr); + } else if (is_placeholder_node()) { + auto *op = operation->as(); + } else if (is_call_node()) { + auto *op = operation->as(); + for (auto &expr : op->read_args()) res.push_back(&expr); + } else if (is_buffer_shared_node()) { + } else { + CINN_NOT_IMPLEMENTED + } + } + + for (auto &e : shape) { + res.push_back(&e); + } + for (auto &e : domain) { + res.push_back(&e); + } + return res; +} + +std::vector _Tensor_::expr_fields() const { + std::vector res; + const char *func_type = operation->as()->func_type(); + if (operation.defined()) { + if (is_compute_node()) { + auto *op = operation->as(); + for (auto &expr : op->body) res.push_back(&expr); + } else if (is_placeholder_node()) { + auto *op = operation->as(); + } else if (is_call_node()) { + auto *op = operation->as(); + for (auto &expr : op->read_args()) res.push_back(&expr); + } else if (is_buffer_shared_node()) { + } else { + LOG(ERROR) << "func_type: " << func_type; + CINN_NOT_IMPLEMENTED + } + } + + for (auto &e : shape) { + res.push_back(&e); + } + for (auto &e : domain) { + res.push_back(&e); + } + + return res; +} + +_Tensor_::~_Tensor_() {} + +Expr _Tensor_::body() const { + if (is_placeholder_node()) return Expr(); + if (is_buffer_shared_node()) return Expr(); + if (is_compute_node()) return operation->as()->body.front(); + if (is_call_node()) return operation->as()->call_expr; + CINN_NOT_IMPLEMENTED; +} + +Expr 
*_Tensor_::mutable_body() {
+  if (is_placeholder_node()) return nullptr;
+  if (is_buffer_shared_node()) return nullptr;
+  if (is_compute_node()) return &operation->as<ComputeOp>()->body.front();
+  if (is_call_node()) return &operation->as<CallOp>()->call_expr;
+  CINN_NOT_IMPLEMENTED
+}
+
+ir::Tensor _Tensor_::InitReduction(poly::StageMap stages, const Target &target) const {
+  CHECK(contains_reduce_axis()) << "InitReduction only works on a reduce tensor";
+  // Return early if the init tensor already exists.
+  std::string init_reduce_tensor_name = GenReduceInitTensorNameOf(name);
+  if (stages->Lookup(init_reduce_tensor_name)) return stages[this]->LookupCtrlDepend(init_reduce_tensor_name);
+
+  // Create a new init tensor.
+  auto init_tensor = lang::Compute(
+      domain, [=](const std::vector<Expr> &axis) { return GetReduceInitVal(); }, init_reduce_tensor_name);
+  stages->InsertLazily(init_tensor);
+  std::string this_transform = isl_map_to_str(stages[this]->transform().get());
+  isl::ctx this_ctx = stages[this]->transform().ctx();
+  isl::map temp_transform(this_ctx, this_transform);
+  int reduce_axis_num = this->reduce_axis.size();
+  auto dim_out_names = poly::isl_get_dim_names(stages[this]->transform(), isl_dim_out);
+  auto dim_in_size = isl_map_dim(stages[this]->transform().get(), isl_dim_in);
+  auto dim_in_names = poly::isl_get_dim_names(stages[this]->transform(), isl_dim_in);
+  std::vector<std::string> reduce_axis_input = stages[this]->origin_reduce_axis_names();
+  auto origin_domain = stages[this]->domain();
+  auto reduce_axis_output = poly::GetRelatedOutputAxies(temp_transform, origin_domain, reduce_axis_input);
+  std::set<std::string> reduce_axis_output_set;
+  for (auto &i : reduce_axis_output) {
+    reduce_axis_output_set.insert(i);
+  }
+  int compute_at_axis = -1;
+  for (auto &i : dim_out_names) {
+    if (reduce_axis_output_set.count(i) == 0) {
+      compute_at_axis++;
+    } else {
+      break;
+    }
+  }
+
+  temp_transform = poly::RemoveAxiesByOutputNames(temp_transform, origin_domain, reduce_axis_output);
+
+  //! When the first axis is not a reduce axis, do ComputeAt.
+  if (compute_at_axis >= 0) {
+    stages[init_tensor]->ComputeAt2(stages[this], compute_at_axis);
+    init_tensor->new_indices = this->new_indices;
+    stages[this]->CtrlDepend(init_tensor);
+    stages[init_tensor]->ShareBufferWith(stages[this]);
+    init_tensor->shape = shape;
+    return init_tensor;
+  }
+  //! When the reduce axes are reordered to the front, ComputeAt is illegal,
+  //! so we just copy the transform and the forloop info.
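+  //! Sketch of that copy: retarget the transform's input/output tuple names
+  //! to the init tensor, then migrate each forloop annotation whose output
+  //! axis name matches between the two transforms.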
+ isl_map_set_tuple_name(temp_transform.get(), isl_dim_in, init_reduce_tensor_name.c_str()); + isl_map_set_tuple_name(temp_transform.get(), isl_dim_out, init_reduce_tensor_name.c_str()); + stages[init_tensor]->SetTransform(temp_transform); + auto init_dim_out_names = poly::isl_get_dim_names(temp_transform, isl_dim_out); + std::map temp_forloop_info = stages[this]->forloop_infos(); + std::map init_forloop_info; + for (auto &i : temp_forloop_info) { + for (int j = 0; j < init_dim_out_names.size(); j++) { + if (i.first < 0) continue; + int new_i = poly::isl_get_original_axes_from_optimized_level(stages[this]->transformed_domain().get(), i.first); + if (dim_out_names[new_i] == init_dim_out_names[j]) { + stages[init_tensor]->AddForloopInfo(j, i.second); + } + } + } + init_tensor->new_indices = this->new_indices; + stages[this]->CtrlDepend(init_tensor); + stages[init_tensor]->ShareBufferWith(stages[this]); + init_tensor->shape = shape; + return init_tensor; +} + +ir::Tensor _Tensor_::GetInitTensor(poly::StageMap stages, const Target &target) const { + return InitReduction(stages, target); +} + +Expr _Tensor_::tensor_store_expanded_body() { + CHECK(!is_placeholder_node()) << "placeholder should not expand store"; + + Expr final_body = body(); + if (shape.empty()) return final_body; + + std::vector g_axis = common::GenDefaultAxisAsExpr(shape.size()); + if (!new_indices.empty()) { + g_axis = new_indices; + } + + auto *reduce_node = body().As(); + if (reduce_node) { + final_body = reduce_node->body; + switch (reduce_node->reduce_type) { + case ir::Reduce::kSum: + final_body = Tensor(this)(g_axis) + final_body; + break; + case ir::Reduce::kMul: + final_body = Tensor(this)(g_axis) * final_body; + break; + case ir::Reduce::kMax: + final_body = Max::Make(Tensor(this)(g_axis), final_body); + break; + case ir::Reduce::kMin: + final_body = Min::Make(Tensor(this)(g_axis), final_body); + break; + case ir::Reduce::kAll: + final_body = Tensor(this)(g_axis) && final_body; + break; + case ir::Reduce::kAny: + final_body = Tensor(this)(g_axis) || final_body; + break; + default: + CINN_NOT_IMPLEMENTED + } + } + + if (is_tuple()) return final_body; + + return ir::Store::Make(Expr(Buffer(this)), final_body, g_axis); +} + +void _Tensor_::Bind(lang::Buffer &buffer) { + // CHECK(!inlined()) << "Inlined tensor should bing buffer"; + CHECK(!buffer->type().is_void()); + if (this->buffer.defined()) { + // remove the old buffer + if (this->buffer == buffer.buffer()) return; + this->buffer->Unbind(this); + } + // Extract the tensors thouse has binded to this buffer. + buffer_depended_tensor_names_ = buffer.buffer()->binded_tensor_names(); + + buffer.buffer()->BindTo(this); + CHECK(!buffer->binded_tensor_names().empty()); + this->buffer = buffer.buffer(); + CHECK(this->buffer.defined()); +} + +void _Tensor_::Bind(const Buffer &buffer) { + lang::Buffer buf(buffer); + Bind(buf); +} + +void _Tensor_::WithBuffer(const Type &type) { + Type buf_type = type.is_void() ? type_ : type; + lang::Buffer buf(buf_type); + buf->target = common::DefaultHostTarget(); + Bind(buf); +} + +void _Tensor_::WithBuffer(const std::string &memory_type, const std::string &buffer_name, const Type &type) { + Type buf_type = type.is_void() ? 
type_ : type; + if (this->buffer.defined()) { + this->buffer->dtype = buf_type; + this->buffer->name = buffer_name; + if (memory_type == "shared") { + this->buffer->memory_type = MemoryType::GPUShared; + } else if (memory_type == "local") { + this->buffer->memory_type = MemoryType::GPULocal; + } else if (memory_type == "global") { + this->buffer->memory_type = MemoryType::Heap; + } else { + LOG(FATAL) << "Not supported memory type " << memory_type; + } + } else { + lang::Buffer buf(buf_type, buffer_name); + buf->target = common::DefaultHostTarget(); + Bind(buf); + + if (memory_type == "shared") { + buf->memory_type = MemoryType::GPUShared; + } else if (memory_type == "local") { + buf->memory_type = MemoryType::GPULocal; + } else if (memory_type == "global") { + buf->memory_type = MemoryType::Heap; + } else { + LOG(FATAL) << "Not supported memory type " << memory_type; + } + } +} + +bool _Tensor_::HasSameShapeWith(const Tensor &other) const { + if (shape.size() != other->shape.size()) return false; + + for (int i = 0; i < shape.size(); i++) { + Expr dim0 = common::AutoSimplify(shape[i]); + Expr dim1 = common::AutoSimplify(other->shape[i]); + + if (dim0 != dim1) return false; + } + return true; +} + +Tensor _Tensor_::TupleGet(int offset) const { + CHECK(is_tuple()); + auto *call = body().As(); + CHECK_LT(offset, call->write_args.size()); + auto tensor = call->write_args[offset].as_tensor_ref(); + tensor->WithBuffer(); + return tensor; +} + +bool _Tensor_::is_tuple() const { + if (!has_expression()) return false; + auto *call = body().As(); + if (call && call->is_extern_call() && !call->write_args.empty()) return true; + return false; +} + +std::vector _Tensor_::domain_with_reduce_axis() const { + if (reduce_axis.empty()) return domain; + auto res = domain; + for (const Var &axis : reduce_axis) { + CHECK(axis->upper_bound.type().is_int(32)) << axis->upper_bound; + res.push_back(axis->upper_bound); + } + return res; +} + +bool operator<(const Tensor &a, const Tensor &b) { return a->name < b->name; } + +Tensor::Tensor(const std::string &name, + Type dtype, + const std::vector &shape, + const std::vector &domain, + FunctionRef fn, + const std::vector &reduce_axis) + : IrNodeRef(_Tensor_::Make(name, dtype, shape, domain, fn, reduce_axis).self()) {} + +bool _Tensor_::is_tuple_get() const { + return is_call_node() && operation.defined() && + operation->as()->func_type() == ir::CallOp::__func_type__ && + operation->as()->is_tuple_get; +} + +bool _Tensor_::IsDependOnStatement(absl::string_view statement) { + if (!is_compute_node()) { + return false; + } + + auto depend_tensors = DependingTensorNames(); + for (const auto &x : depend_tensors) { + if (x == statement) return true; + } + return false; +} + +std::set _Tensor_::DependingTensorNames() { + std::set res; + if (body().defined()) { + auto depend_tensors = ir::CollectIRNodes(body(), [](const Expr *x) -> bool { return x->as_tensor(); }); + for (const auto &x : depend_tensors) { + if (x.get() != this) { + res.insert(x.as_tensor()->name); + } + } + } + return res; +} + +const std::vector &_Tensor_::axis() const { + CHECK_EQ(axis_.size(), domain_without_reduce_axis().size()); + return axis_; +} + +std::vector _Tensor_::axis_with_reduce() const { + auto axis = axis_; + axis.insert(axis.end(), reduce_axis.begin(), reduce_axis.end()); + return axis; +} + +bool _Tensor_::Uses(const Tensor &other) const { + auto loads = ir::CollectIRNodes(body(), [&](const Expr *x) { + auto *loadn = x->As(); + if (!loadn) return false; + return 
loadn->tensor.as_tensor()->name == other->name; + }); + return !loads.empty(); +} + +ir::Tensor _Tensor_::Reshape(const std::vector &shape, poly::StageMap stages) const { + CHECK(!stages[this]->inlined()); + auto op = BufferShareOp::Make(); + auto n = make_shared<_Tensor_>(); + auto selft = Tensor(const_cast(this)); + + { + Expr this_num_elements = Expr(1); + for (auto &e : this->shape) this_num_elements = this_num_elements * e; + + Expr num_elements = Expr(1); + for (auto &e : shape) num_elements = num_elements * e; + + CHECK(MathIsZero(this_num_elements - num_elements)) << "number of elements mismatch"; + } + + n->name = Context::Global().NewName(name + "_reshape"); + n->shape = shape; + n->domain = shape; + n->set_type(type()); + n->operation = op; + n->InitAxis(); + + auto t = Tensor(n); + stages->InsertLazily(t); + + stages[n]->ShareBufferWith(stages[this]); + stages[n]->CtrlDepend(selft); + return t; +} + +ir::Tensor _Tensor_::ReshapeCopied(const std::vector &shape, poly::StageMap stages) const { + auto t = ir::Tensor(const_cast(this)); + auto copied = Compute( + domain, + [=](const std::vector &axis) { return t(axis); }, + Context::Global().NewName(this->name + "_copied")); + stages->InsertLazily(copied); + auto res = copied->Reshape(shape, stages); + stages->InsertLazily(res); + return res; +} + +Shared CreateStage(Tensor tensor) { + auto isl_domain = tensor->GenerateIslDomain(); + return poly::Stage::New(isl_domain, tensor->body(), tensor.self()); +} + +std::string GenReduceInitTensorNameOf(const std::string &tensor_name) { return tensor_name + "__reduce_init"; } + +bool _Tensor_::is_reduce_sum() const { + if (!contains_reduce_axis()) return false; + return body().As() && body().As()->reduce_type == ir::Reduce::ReduceType::kSum; +} +bool _Tensor_::is_reduce_mul() const { + if (!contains_reduce_axis()) return false; + return body().As() && body().As()->reduce_type == ir::Reduce::ReduceType::kMul; +} + +Expr _Tensor_::GetReduceInitVal() const { + CHECK(is_reduce_tensor()); + return body().As()->init; +} + +bool _Tensor_::IsReduceInited(poly::StageMap stages) const { return stages->Lookup(GenReduceInitTensorNameOf(name)); } + +void _Tensor_::Verify() const { + CHECK(!shape.empty()); + CHECK(!domain.empty()); + CHECK(!name.empty()) << "Name of tensor should be set"; +} + +} // namespace ir +} // namespace cinn diff --git a/paddle/cinn/ir/tensor.h b/paddle/cinn/ir/tensor.h new file mode 100644 index 0000000000000..437fe62e6d31c --- /dev/null +++ b/paddle/cinn/ir/tensor.h @@ -0,0 +1,342 @@ +// Copyright (c) 2021 CINN Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include "cinn/common/graph_utils.h" +#include "cinn/ir/buffer.h" +#include "cinn/ir/function_base.h" +#include "cinn/lang/buffer.h" +#include "cinn/poly/stage.h" + +namespace cinn { + +namespace ir { +class Tensor; +} // namespace ir + +namespace lang { +template +struct Placeholder; + +void InitReduceTensor(poly::StageMap stages, + const ir::Tensor& tensor, + const Target& target = common::DefaultHostTarget()); +} // namespace lang + +namespace ir { +namespace detail { +constexpr bool LE(int a, int b) { return a <= b; } +constexpr bool GE(int a, int b) { return a >= b; } + +} // namespace detail + +class _Tensor_; +class Tensor; + +class Tensor : public ir::IrNodeRef { + public: + Tensor() = default; + explicit Tensor(ir::IrNode* n) : IrNodeRef(n) {} + Tensor(const std::string& name, + Type dtype, + const std::vector& shape, + const std::vector& domain, + FunctionRef fn, + const std::vector& reduce_axis = {}); + + //! Get number of dimensions. + size_t ndims() const; + + /** + * Take elements from the tensor. + * This take one or multiple expressions as indices. + * + * usage: + * + * Tensor A; + * A(i,j) get the [i][j] element. + */ + // @{ + Expr operator()(const Expr& a) const { return operator()(std::vector({a})); } + template + inline typename std::enable_if::type operator()(Args&&... args) const { + return operator()({std::forward(args)...}); + } + // @} + + /** + * Take elements from the tensor. + * @param indices The indices. + * @return The result expression representing a tensor read. + */ + Expr operator()(const std::vector& indices) const; + + friend bool operator<(const Tensor& a, const Tensor& b); + + _Tensor_* self() { return operator->(); } + const _Tensor_* self() const { return operator->(); } + + inline const _Tensor_* operator->() const { return As<_Tensor_>(); } + inline _Tensor_* operator->() { return As<_Tensor_>(); } + + //! Cast to an Expr. + inline operator Expr() const { return Expr(get()); } +}; + +/** + * \brief Generate the name of the reduce init tensor of \p tensor. + * This is used for retrieving the corresponding reduction-init tensor from a stage map by name. + */ +std::string GenReduceInitTensorNameOf(const std::string& tensor_name); + +class ComputeOp; +class PlaceholderOp; +struct ReadCacheRelation; +struct WriteCacheRelation; + +/** + * _Tensor_ holds the content of a Tensor. + * + * NOTE(All) Some rules: + * + * 1. a _Tensor_ is a node in SSA, so every tensor's name should be unique, + * 2. never try to change a tensor's name, that will cause chaos. + */ +class _Tensor_ : public ExprNode<_Tensor_> { + public: + //! Shape of this tensor(buffer). + std::vector shape; + //! The domain of each axis(without reduce_axis) + // TODO(Superjomn) support ISL domain. + std::vector domain; + + std::vector reduce_axis; + //! The operation that generates Tensor. + FunctionRef operation; + //! Name of this tensor. + std::string name; + //! The bound buffer, for each tensor if it is not inline. + Buffer buffer; + //! Normal axis. + mutable std::vector axis_; + + std::vector new_indices{}; + std::vector domain_with_reduce_axis() const; + const std::vector& domain_without_reduce_axis() const { return domain; } + + //! Generate a tensor from a function. 
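+  //! A direct-construction sketch (tensors are normally created through
+  //! lang::Compute; the `compute_op` below is illustrative):
+  //!   ir::Tensor t = _Tensor_::Make("C", Float(32), /*shape=*/{M, N},
+  //!                                 /*domain=*/{M, N}, compute_op);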
+ static Tensor Make(const std::string& name, + Type dtype, + const std::vector& shape, + const std::vector& domain, + FunctionRef fn, + const std::vector& reduce_axis = {}); + + void Verify() const override; + + bool IsReduceInited(poly::StageMap stages) const; + + //! Tell whether this tensor represents a tuple (consists of one or multiple tensors as output of a extern Call). + bool is_tuple() const; + bool is_tuple_get() const; + + Tensor TupleGet(int offset) const; + + /** + * Get the names of the dependency(read or write) tensors. + * e.g. A[i] = C[i]*2 + D[i], A's dependency tensors are {C,D} + */ + std::set GetDependTensorNames() const; + + /** + * \brief Tell whether this tensor's computation relays on a specific statement. + * @param statement The name of a statement(equivalent to the id of tensor). + * @return A boolean. + */ + bool IsDependOnStatement(absl::string_view statement); + + /** + * Get the names of the tensors thouse this tensor depends on. + */ + std::set DependingTensorNames(); + + /** + * Get a new tensor with the \p shape, but the underlying buffer shared. + * NOTE the tensor to Reshape should not be an inlined computation. + */ + ir::Tensor Reshape(const std::vector& shape, poly::StageMap stages) const; + + /** + * Get a new tensor with the \p shape with a newly allocated buffer. + * NOTE the tensor to Reshape should not be an inlined computation. + */ + ir::Tensor ReshapeCopied(const std::vector& shape, poly::StageMap stages) const; + + /** + * Tell whether this tensor has same shape with \p other. + */ + bool HasSameShapeWith(const Tensor& other) const; + + //! Operation related. + // @{ + bool is_compute_node() const; + bool is_placeholder_node() const; + bool is_call_node() const; + bool is_extern_call_node() const; + bool is_preceding_view_node() const; + bool is_buffer_shared_node() const; + const char* operation_type() const; + ComputeOp* get_compute_op() const; + PlaceholderOp* get_placeholder_op() const; + // @} + + //! The expression generate this tensor, will be empty if it is a PlaceHolder. + Expr body() const; + Expr* mutable_body(); + //! Get the expression with `store(tensor)` inserted into the body. + Expr tensor_store_expanded_body(); + + Expr inline_expanded(const std::vector& indices); + + //! Tell whether contain a reduce axis. + bool contains_reduce_axis() const { return !reduce_axis.empty(); } + bool is_reduce_tensor() const { return contains_reduce_axis(); } + bool is_reduce_sum() const; + bool is_reduce_mul() const; + //! Get the initial value of a reduce tensor. + Expr GetReduceInitVal() const; + + std::vector expr_fields() override; + std::vector expr_fields() const override; + + /** + * The normal axis without reducing ones. + */ + const std::vector& axis() const; + + /** + * The axis with the reduce ones. + */ + std::vector axis_with_reduce() const; + + /** + * Get the tensors thouse depend on the same buffer belong to this tensor. + */ + const std::set& buffer_depended_tensor_names() const { return buffer_depended_tensor_names_; } + + static const IrNodeTy _node_type_ = IrNodeTy::_Tensor_; + + _Tensor_() : ExprNode<_Tensor_>(Float(32)) {} + + bool has_expression() const; + + ~_Tensor_(); + + /** + * Tell if this tensor uses other tensors in the body. + */ + bool Uses(const ir::Tensor& other) const; + + //! Bind to a buffer, will persist data to the buffer in runtime. + void Bind(lang::Buffer& buffer); // NOLINT + void Bind(const Buffer& buffer); + void UnBind(lang::Buffer& buffer); // NOLINT + + //! 
Create a buffer belong to this tensor. + void WithBuffer(const Type& type = Void()); + void WithBuffer(const std::string& memory_type, const std::string& buffer_name = "", const Type& type = Void()); + Tensor GetInitTensor(poly::StageMap stages, const Target& target = common::DefaultHostTarget()) const; + + private: + //! Initialize the axis field after the shape field is assigned. + void InitAxis() const; + + isl::set GenerateIslDomain() const; + + /** + * Create the initialization tensor. + * @param stages The stages. + * @param init_val The initial value. + * @return The initializing tensor. + */ + ir::Tensor InitReduction(poly::StageMap stages, const Target& target = common::DefaultHostTarget()) const; + + //! The names of the tensors depend the same buffer and should schedule before this. + std::set buffer_depended_tensor_names_; + + friend Shared CreateStage(Tensor tensor); + + friend void lang::InitReduceTensor(poly::StageMap stages, const ir::Tensor& tensor, const Target& target); +}; + +Shared CreateStage(Tensor tensor); + +class _Operation_; +class Operation : public FunctionRef { + public: + Operation() = default; + explicit Operation(IrNode* n) : FunctionRef(n) {} + + inline const _Operation_* operator->() const { return reinterpret_cast<_Operation_*>(get()); } + inline _Operation_* operator->() { return reinterpret_cast<_Operation_*>(get()); } + + //! Get the i-th output of the operation. + // Tensor output(size_t i) const; + + std::string name; +}; + +class _Operation_ : public ir::FunctionBase { + public: + //! Optional name of the operation. + std::string name; + //! Optional tag of the operation. + std::string tag; + //! Additional attributes of the operation. + std::map attrs; + + const std::string& func_name() const final { return name; } + + void Verify() const override {} + + //! The function type. + virtual const char* func_type() const = 0; +}; + +} // namespace ir +} // namespace cinn + +namespace std { + +template <> +struct hash { + inline size_t operator()(const cinn::ir::Tensor& x) { + // We treat the tensor's name as the unique identifier. + return std::hash()(x->name); + } +}; + +} // namespace std diff --git a/paddle/cinn/ir/tensor_test.cc b/paddle/cinn/ir/tensor_test.cc new file mode 100755 index 0000000000000..54c46bfa7028b --- /dev/null +++ b/paddle/cinn/ir/tensor_test.cc @@ -0,0 +1,211 @@ +// Copyright (c) 2021 CINN Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "cinn/ir/tensor.h" + +#include + +#include "cinn/backends/codegen_c.h" +#include "cinn/backends/llvm/execution_engine.h" +#include "cinn/cinn.h" +#include "cinn/common/test_helper.h" +#include "cinn/ir/ir_operators.h" +#include "cinn/ir/ir_printer.h" +#include "cinn/lang/builtin.h" +#include "cinn/lang/compute.h" +#include "cinn/lang/lower.h" +#include "cinn/lang/packed_func.h" +#include "cinn/lang/placeholder.h" + +namespace cinn { +namespace ir { +using utils::GetStreamCnt; +using utils::Trim; + +TEST(Tensor, inlined) { + Expr M(100), N(20); + + Placeholder A("A", {M, N}); + Placeholder B("B", {M, N}); + + // C is inlined + Tensor C = lang::Compute( + {M, N}, [=](Var i, Var j) { return A(i, j) + B(i, j); }, "C"); + + Tensor D = lang::Compute( + {M, N}, [=](Var i, Var j) -> Expr { return C(i, j) * 2.f + 1.f; }, "D"); + + auto stages = CreateStages({D}); + stages[C]->ComputeInline(); + + auto func = lang::Lower("func_C", stages, {A, B, D}); + std::cout << "output: \n" << func << std::endl; + auto out = GetStreamCnt(func); + EXPECT_EQ(Trim(out), Trim(R"ROC( +function func_C (_A, _B, _D) +{ + serial for (i, 0, 100) + { + serial for (j, 0, 20) + { + D[i, j] = (1.00000000f + ((2.00000000f * A[i, j]) + (2.00000000f * B[i, j]))) + } + } +} +)ROC")); +} + +TEST(Tensor, IsDependOnStatement) { + Expr N(100); + + Placeholder X("X", {N}); + auto t = Compute( + {N}, [&](Var i) -> Expr { return X(i); }, "t"); + + ASSERT_TRUE(t->IsDependOnStatement("X")); + ASSERT_FALSE(t->IsDependOnStatement("XXX")); +} + +TEST(Tensor, Reshape) { + Context::Global().ResetNameId(); + Expr M(100); + Expr N(100); + Placeholder A("A", {M, N}); + + auto stages = CreateStages({A}); + + auto A1 = A->Reshape({Expr(10), Expr(10), Expr(100)}, stages); + auto B = Compute( + A1->shape, [=](Expr i, Expr j, Expr k) { return A1(i, j, k) * 2.f; }, "B"); + + stages->InsertLazily(B); + + auto func = lang::Lower("fn", stages, {A, B}); + + ir::Module::Builder builder("some_modue", common::DefaultHostTarget()); + builder.AddFunction(func); + + backends::CodeGenC codegenc(common::DefaultHostTarget()); + codegenc.SetInlineBuiltinCodes(false); + auto source = codegenc.Compile(builder.Build(), CodeGenC::OutputKind::CImpl); + LOG(INFO) << "source:\n" << source; + + auto target_source = R"ROC( +#include +#include + +void fn(void* _args, int32_t num_args) +{ + const cinn_buffer_t* _A = cinn_pod_value_to_buffer_p(&(((cinn_pod_value_t*)(_args))[0])); + cinn_buffer_t* _B = cinn_pod_value_to_buffer_p(&(((cinn_pod_value_t*)(_args))[1])); + cinn_buffer_malloc((void*)(0), _B); + const float* A_reshape = ((const float*)(_A->memory)); + float* B = ((float*)(_B->memory)); + for (int32_t i = 0; i < 10; i += 1) { + for (int32_t j = 0; j < 10; j += 1) { + for (int32_t k = 0; k < 100; k += 1) { + B[((1000 * i) + ((100 * j) + k))] = (2.00000000f * A_reshape[((1000 * i) + ((100 * j) + k))]); + }; + }; + }; + cinn_buffer_free((void*)(0), _B); +} +)ROC"; + + ASSERT_EQ(Trim(target_source), Trim(source)); +} + +TEST(Tensor, ReshapeCopied) { + Context::Global().ResetNameId(); + Expr M(100); + Expr N(100); + Placeholder A("A", {M, N}); + + auto stages = CreateStages({A}); + + auto A1 = A->ReshapeCopied({Expr(10), Expr(10), Expr(100)}, stages); + auto B = Compute( + A1->shape, [=](Expr i, Expr j, Expr k) { return A1(i, j, k) * 2.f; }, "B"); + + stages->InsertLazily(B); + + ir::Module::Builder builder("some_modue", common::DefaultHostTarget()); + auto func = lang::Lower("fn", stages, {A, B}, {}, {}, &builder); + + backends::CodeGenC 
codegenc(common::DefaultHostTarget());
+  codegenc.SetInlineBuiltinCodes(false);
+  auto source = codegenc.Compile(builder.Build(), CodeGenC::OutputKind::CImpl);
+  LOG(INFO) << "source:\n" << source;
+
+  auto target_source = R"ROC(
+#include
+#include
+
+void fn(void* _args, int32_t num_args)
+{
+  const cinn_buffer_t* _A = cinn_pod_value_to_buffer_p(&(((cinn_pod_value_t*)(_args))[0]));
+  cinn_buffer_t* _B = cinn_pod_value_to_buffer_p(&(((cinn_pod_value_t*)(_args))[1]));
+  cinn_buffer_t* _A_copied_reshape = cinn_buffer_t::new_((cinn_device_kind_t)(0)/*target*/, cinn_float32_t(), { 10, 10, 100 }, 32/*align*/);
+  cinn_buffer_malloc((void*)(0), _B);
+  cinn_buffer_malloc((void*)(0), _A_copied_reshape);
+  const float* A = ((const float*)(_A->memory));
+  float* A_copied = ((float*)(_A_copied_reshape->memory));
+  const float* A_copied_reshape = ((const float*)(_A_copied_reshape->memory));
+  float* B = ((float*)(_B->memory));
+  for (int32_t i = 0; i < 100; i += 1) {
+    for (int32_t j = 0; j < 100; j += 1) {
+      A_copied[((100 * i) + j)] = A[((100 * i) + j)];
+    };
+  };
+  for (int32_t i = 0; i < 10; i += 1) {
+    for (int32_t j = 0; j < 10; j += 1) {
+      for (int32_t k = 0; k < 100; k += 1) {
+        B[((1000 * i) + ((100 * j) + k))] = (2.00000000f * A_copied_reshape[((1000 * i) + ((100 * j) + k))]);
+      };
+    };
+  };
+  cinn_buffer_free((void*)(0), _A_copied_reshape);
+  cinn_buffer_free((void*)(0), _B);
+}
+)ROC";
+
+  ASSERT_EQ(Trim(target_source), Trim(source));
+}
+
+TEST(Tensor, reduce) {
+  Placeholder<float> A("A", {Expr(10)});
+  Var reduce_axis(Expr(10), "reduce_k");
+  {
+    auto C = Compute(
+        A->shape,
+        [=](const std::vector<Expr>& axis) { return lang::ReduceSum(A(reduce_axis) + 1.f, {reduce_axis}); },
+        "C");
+    ASSERT_TRUE(C->has_expression());
+    ASSERT_TRUE(C->is_reduce_sum());
+    ASSERT_FALSE(C->is_reduce_mul());
+  }
+
+  {
+    auto C = Compute(
+        A->shape,
+        [=](const std::vector<Expr>& axis) { return lang::ReduceMul(A(reduce_axis) + 1.f, {reduce_axis}); },
+        "C");
+    ASSERT_TRUE(C->has_expression());
+    ASSERT_TRUE(C->is_reduce_mul());
+    ASSERT_FALSE(C->is_reduce_sum());
+  }
+}
+
+} // namespace ir
+} // namespace cinn
diff --git a/paddle/cinn/lang/CMakeLists.txt b/paddle/cinn/lang/CMakeLists.txt
new file mode 100644
index 0000000000000..9a9c86a63e141
--- /dev/null
+++ b/paddle/cinn/lang/CMakeLists.txt
@@ -0,0 +1,17 @@
+core_gather_headers()
+
+gather_srcs(cinnapi_src SRCS
+    buffer.cc
+    compute.cc
+    placeholder.cc
+    lower.cc
+    builtin.cc
+    lower_impl.cc
+    packed_func.cc
+    )
+
+cc_test(test_compute SRCS compute_test.cc DEPS cinncore)
+cc_test(test_placeholder SRCS placeholder_test.cc DEPS cinncore)
+cc_test(test_lower SRCS lower_test.cc DEPS cinncore)
+cc_test(test_lower_impl SRCS lower_impl_test.cc DEPS cinncore)
+cc_test(test_cinn_packed_func SRCS packed_func_test.cc DEPS cinncore)
diff --git a/paddle/cinn/lang/README.md b/paddle/cinn/lang/README.md
new file mode 100644
index 0000000000000..ebbb2ca579f64
--- /dev/null
+++ b/paddle/cinn/lang/README.md
@@ -0,0 +1,93 @@
+# Design of CINN/DSL
+This module is a simple DSL defined in the CINN project.
+The DSL module aims to represent the overall computation in a hardware-independent way.
+
+## Concepts
+### Object
+All the mutable elements in CINN are `Object`s.
+### Shared
+`Shared` objects are reference-counted, self-contained containers, similar to `std::shared_ptr`.
+
+One can pass a `Shared` object by passing a pointer, and the consumer object should store it in a local `Shared` member variable.
+
+## Tensor
+
+The input or the temporary output node.
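+
+A minimal sketch of producing a tensor with `Compute` (schematic, in the
+style of the snippets below; `M` and `N` are integer `Expr` dimensions):
+
+```c++
+PlaceHolder A("A", {M, N});
+
+// B is a new Tensor defined element-wise over A.
+Tensor B = Compute({M, N}, [&](Var i, Var j) {
+  return A(i, j) + 1;
+});
+```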
+
+Every `Compute` outputs a `Tensor`, and a tensor can be sliced.
+
+
+
+### PlaceHolder
+
+The special tensor that represents an input slot.
+
+```c++
+PlaceHolder A("A", {M, N});
+PlaceHolder B("B", {M, N});
+```
+
+## Operation
+
+An `Operation` describes an operation on tensors, including
+
+- placeholder
+- compute
+- bound inference
+
+```c++
+Tensor C = Compute({M,N}/*output shape*/, [&](Var i, Var j) {
+  Var k;
+  return ReduceSum(A[i,k] * B[k,j], {k});
+});
+```
+
+### Bound inference
+
+The PlaceHolder should define a shape.
+
+```c++
+Var M(Int(32));
+Var N(Int(32));
+
+PlaceHolder A({M, N});
+
+Var i, j;
+Expr tmp = A[i][j] + 1; // i \in {0, M}; j \in {0, N}
+```
+
+To simplify the implementation, we use ISL to generate code for basic snippets.
+
+## Schedule
+
+The schedule will
+
+1. determine the order of computation by topologically sorting the computational graph composed of tensors, and
+2. transform the computations.
+
+### Order schedule
+
+1. Topologically sort the tensors.
+2. For each tensor, generate the code it needs.
+
+## Some examples
+A matrix multiplication:
+
+```c++
+// Declare some iterator variables.
+Var i, j, k;
+Placeholder A({M, K}), B({K, N});
+
+Tensor C = Compute({M, N}/*output shape*/,
+    [&](Var i, Var j) {
+      return ReduceSum(A(i,k) * B(k, j), {k});
+    }, "C");
+Tensor D = Compute({M, N}, [&](Var i, Var j) {
+  return Map(C(i,j) + 1);
+});
+
+Schedule s = CreateSchedule(C);
+auto func = Build(s, [A, B, C], target=target, name="matmul");
+
+func(a, b, c);
+```
diff --git a/paddle/cinn/lang/buffer.cc b/paddle/cinn/lang/buffer.cc
new file mode 100644
index 0000000000000..182d8c4b4c5a9
--- /dev/null
+++ b/paddle/cinn/lang/buffer.cc
@@ -0,0 +1,36 @@
+// Copyright (c) 2021 CINN Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "cinn/lang/buffer.h"
+
+#include "cinn/ir/buffer.h"
+
+namespace cinn {
+namespace lang {
+
+using ir::_Buffer_;
+
+Buffer::Buffer(Type type, const std::string& name) {
+  buffer_ = _Buffer_::Make();
+  buffer_->dtype = type;
+  buffer_->set_type(type_of<cinn_buffer_t*>());
+  buffer_->elem_offset = Expr(0);
+  if (!name.empty()) {
+    buffer_->name = name;
+  }
+  buffer_->target = common::DefaultHostTarget();
+}
+
+} // namespace lang
+} // namespace cinn
diff --git a/paddle/cinn/lang/buffer.h b/paddle/cinn/lang/buffer.h
new file mode 100644
index 0000000000000..bcb4f5a602e74
--- /dev/null
+++ b/paddle/cinn/lang/buffer.h
@@ -0,0 +1,44 @@
+// Copyright (c) 2021 CINN Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include
+
+#include "cinn/ir/buffer.h"
+
+namespace cinn {
+namespace lang {
+
+/**
+ * This is a DSL wrapper for ir::Buffer.
+ */
+class Buffer {
+ public:
+  explicit Buffer(Type type, const std::string& name = "");
+  explicit Buffer(const ir::Buffer& x) : buffer_(x) {}
+
+  ir::_Buffer_* operator->() { return buffer_.As<ir::_Buffer_>(); }
+  const ir::_Buffer_* operator->() const { return buffer_.As<ir::_Buffer_>(); }
+
+  ir::_Buffer_* self() { return buffer_.As<ir::_Buffer_>(); }
+
+  ir::Buffer buffer() const { return buffer_; }
+
+ private:
+  ir::Buffer buffer_;
+};
+
+} // namespace lang
+} // namespace cinn
diff --git a/paddle/cinn/lang/builtin.cc b/paddle/cinn/lang/builtin.cc
new file mode 100644
index 0000000000000..266f704a76576
--- /dev/null
+++ b/paddle/cinn/lang/builtin.cc
@@ -0,0 +1,262 @@
+// Copyright (c) 2021 CINN Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "cinn/lang/builtin.h"
+
+#include
+#include
+#include
+
+#include "cinn/cinn.h"
+#include "cinn/common/ir_util.h"
+#include "cinn/ir/ir.h"
+#include "cinn/lang/buffer.h"
+
+namespace cinn {
+namespace lang {
+
+using cinn::common::bfloat16;
+using cinn::common::float16;
+
+Expr logic_and(const std::vector<Expr>& conds) {
+  CHECK(!conds.empty());
+  // A single condition needs no conjunction; guard before touching conds[1].
+  if (conds.size() == 1) return conds[0];
+  auto start = ir::And::Make(conds[0], conds[1]);
+  for (int i = 2; i < conds.size(); i++) {
+    start = ir::And::Make(start, conds[i]);
+  }
+  return start;
+}
+
+Expr logic_or(const std::vector<Expr>& conds) {
+  CHECK(!conds.empty());
+  if (conds.size() == 1) return conds[0];
+  auto start = ir::Or::Make(conds[0], conds[1]);
+  for (int i = 2; i < conds.size(); i++) {
+    start = ir::Or::Make(start, conds[i]);
+  }
+  return start;
+}
+
+//!
extern call op +#define EXTERN_CALL_IMP(name__, target__) \ + Expr name__(Expr e) { return ir::Call::Make(e->type(), #target__, {e}, {}, ir::CallType::Extern); } + +#define EXTERN_CALL_IMP_NO_VEC(name__, target__) \ + Expr name__(Expr e) { \ + return ir::Call::Make( \ + e->type(), #target__, {e}, {}, ir::CallType::Extern, ir::FunctionRef(), 0, {{"vectorizable", false}}); \ + } + +EXTERN_CALL_IMP(Exp, exp); +EXTERN_CALL_IMP_NO_VEC(Erf, erf); +EXTERN_CALL_IMP(Sqrt, sqrt); +EXTERN_CALL_IMP(Rsqrt, rsqrt); +EXTERN_CALL_IMP(Log, log); +EXTERN_CALL_IMP(Log2, log2); +EXTERN_CALL_IMP(Log10, log10); +EXTERN_CALL_IMP(Floor, floor); +EXTERN_CALL_IMP(Ceil, ceil); +EXTERN_CALL_IMP(Round, round); +EXTERN_CALL_IMP(Trunc, trunc); +EXTERN_CALL_IMP(Cos, cos); +EXTERN_CALL_IMP(Sin, sin); +EXTERN_CALL_IMP(Cosh, cosh); +EXTERN_CALL_IMP(Tan, tan); +EXTERN_CALL_IMP(Tanh, tanh); +EXTERN_CALL_IMP(Sinh, sinh); +EXTERN_CALL_IMP_NO_VEC(Acos, acos); +EXTERN_CALL_IMP_NO_VEC(Acosh, acosh); +EXTERN_CALL_IMP_NO_VEC(Asin, asin); +EXTERN_CALL_IMP_NO_VEC(Asinh, asinh); +EXTERN_CALL_IMP_NO_VEC(Atan, atan); +EXTERN_CALL_IMP_NO_VEC(Atanh, atanh); +EXTERN_CALL_IMP(Cbrt, cbrt); +EXTERN_CALL_IMP(Clz, clz); +EXTERN_CALL_IMP(Popc, popc); + +#undef EXTERN_CALL_IMP +#undef EXTERN_CALL_IMP_NO_VEC + +#define EXTERN_BINARY_CALL_IMP(name__, target__) \ + Expr name__(Expr a, Expr b) { \ + CHECK_EQ(a.type(), b.type()) << #name__ << "'s inputs type not equal, where a:" << a.type() \ + << " but b:" << b.type(); \ + return ir::Call::Make(a->type(), #target__, {a, b}, {}, ir::CallType::Extern); \ + } + +EXTERN_BINARY_CALL_IMP(Remainder, mod) +EXTERN_BINARY_CALL_IMP(LogicalRightShift, logical_right_shift) +EXTERN_BINARY_CALL_IMP(Pow, pow) +EXTERN_BINARY_CALL_IMP(Mod, mod) + +#undef EXTERN_BINARY_CALL_IMP + +Expr Zero(const Type& type) { return ir::Zero(type); } + +Expr One(const Type& type) { return ir::One(type); } + +Expr FloorDivide(Expr a, Expr b) { + CHECK_EQ(a.type(), b.type()) << "FloorDivide's inputs type not equal, where a:" << a.type() << " but b:" << b.type(); + if (a.type().is_float()) { + return Floor(a / b); + } else if (a.type().is_uint()) { + return a / b; + } else { + auto div = a / b; + auto mod = a % b; + auto ret = ir::Select::Make( + ir::EQ::Make(mod, common::make_const(a.type(), 0)), div, div - common::make_const(a.type(), 1)); + return ir::Select::Make((a > 0 && b > 0) || (a < 0 && b < 0), div, ret); + } +} + +Expr min_value(const Type& type) { + CHECK_EQ(type.lanes(), 1); +#define FOR_CASE(type__) \ + if (type == type_of()) { \ + return Expr(static_cast(std::numeric_limits::lowest())); \ + } + FOR_CASE(int8_t) + FOR_CASE(int16_t) + FOR_CASE(int32_t) + FOR_CASE(int64_t) + FOR_CASE(uint8_t) + FOR_CASE(uint16_t) + FOR_CASE(uint32_t) + FOR_CASE(uint64_t) + FOR_CASE(bfloat16) + FOR_CASE(float16) + FOR_CASE(float) + FOR_CASE(double) +#undef FOR_CASE + return Expr(); +} + +Expr max_value(const Type& type) { + CHECK_EQ(type.lanes(), 1); + +#define FOR_CASE(type__) \ + if (type == type_of()) { \ + return Expr(static_cast(std::numeric_limits::max())); \ + } + FOR_CASE(int8_t) + FOR_CASE(int16_t) + FOR_CASE(int32_t) + FOR_CASE(int64_t) + FOR_CASE(uint8_t) + FOR_CASE(uint16_t) + FOR_CASE(uint32_t) + FOR_CASE(uint64_t) + FOR_CASE(bfloat16) + FOR_CASE(float16) + FOR_CASE(float) + FOR_CASE(double) +#undef FOR_CASE + + CINN_NOT_IMPLEMENTED + return Expr(); +} + +Expr Epsilon(const Type& type) { + CHECK_EQ(type.lanes(), 1); + +#define FOR_CASE(type__) \ + if (type == type_of()) { \ + return 
Expr(static_cast(std::numeric_limits::epsilon())); \ + } + FOR_CASE(int8_t) + FOR_CASE(int16_t) + FOR_CASE(int32_t) + FOR_CASE(int64_t) + FOR_CASE(uint8_t) + FOR_CASE(uint16_t) + FOR_CASE(uint32_t) + FOR_CASE(uint64_t) + FOR_CASE(bfloat16) + FOR_CASE(float16) + FOR_CASE(float) + FOR_CASE(double) +#undef FOR_CASE + + CINN_NOT_IMPLEMENTED + return Expr(); +} + +Expr Abs(Expr e) { + Type type = e->type(); + Type bool_type = Bool(type.lanes()); + if (type.is_uint()) { + return e; + } else if (type.is_int() || type.is_float()) { + auto node = e.As(); + if (node) { + return make_const(type, std::abs(node->value)); + } + return ir::Select::Make(e > Zero(e->type()), e, -e); + } else { + LOG(FATAL) << "Abs Not support data type " << type; + } + return e; +} + +Expr IsNan(Expr e) { + Type type = e->type(); + if (type.is_int() || type.is_uint()) { + return common::make_bool(false, type.lanes()); + } else if (type.is_float()) { + auto* node = e.As(); + if (node) { + return common::make_bool(std::isnan(node->value), type.lanes()); + } + return CallExtern("isnan", {e}, {{"vectorizable", false}}); + } else { + LOG(FATAL) << type << "is not supported for isnan op."; + return e; + } +} + +Expr Infinity(const Type& type) { + CHECK_EQ(type.lanes(), 1U); + if (type.is_float()) { + if (type.bits() == 64) { + return make_const(type, std::numeric_limits::infinity()); + } else if (type.bits() == 32) { + return make_const(type, std::numeric_limits::infinity()); + } else if (type.bits() == 16) { + return make_const(type, std::numeric_limits::infinity()); + } + } + LOG(FATAL) << "Cannot decide infinity for type " << type; + return Expr(); +} + +Expr IsInf(Expr e) { + Type type = e->type(); + if (type.is_int() || type.is_uint()) { + return common::make_bool(false, type.lanes()); + } else if (type.is_float()) { + auto* node = e.As(); + if (node) { + return common::make_bool(std::isinf(node->value), type.lanes()); + } + return CallExtern("isinf", {e}, {{"vectorizable", false}}); + } else { + LOG(FATAL) << type << "is not supported for isinf op."; + return e; + } +} + +Expr IsFinite(Expr e) { return !IsInf(e) && !IsNan(e); } + +} // namespace lang +} // namespace cinn diff --git a/paddle/cinn/lang/builtin.h b/paddle/cinn/lang/builtin.h new file mode 100644 index 0000000000000..763461b697bc2 --- /dev/null +++ b/paddle/cinn/lang/builtin.h @@ -0,0 +1,173 @@ +// Copyright (c) 2021 CINN Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include + +#include "cinn/common/ir_util.h" +#include "cinn/ir/ir.h" +#include "cinn/ir/ir_operators.h" + +namespace cinn { +namespace lang { + +//! Get the ALL of the conditions. +Expr logic_and(const std::vector& conds); +Expr logic_or(const std::vector& conds); + +Expr Zero(const Type& type); +Expr One(const Type& type); +Expr min_value(const Type& type); +Expr max_value(const Type& type); +Expr Epsilon(const Type& type); + +//! 
extern call op +#define EXTERN_CALL_DCL(name__) Expr name__(Expr e); + +EXTERN_CALL_DCL(Exp); +EXTERN_CALL_DCL(Erf); +EXTERN_CALL_DCL(Sqrt); +EXTERN_CALL_DCL(Rsqrt); +EXTERN_CALL_DCL(Log); +EXTERN_CALL_DCL(Log2); +EXTERN_CALL_DCL(Log10); +EXTERN_CALL_DCL(Floor); +EXTERN_CALL_DCL(Ceil); +EXTERN_CALL_DCL(Round); +EXTERN_CALL_DCL(Trunc); +EXTERN_CALL_DCL(Cos); +EXTERN_CALL_DCL(Cosh); +EXTERN_CALL_DCL(Tan); +EXTERN_CALL_DCL(Sin); +EXTERN_CALL_DCL(Sinh); +EXTERN_CALL_DCL(Acos); +EXTERN_CALL_DCL(Acosh); +EXTERN_CALL_DCL(Asin); +EXTERN_CALL_DCL(Asinh); +EXTERN_CALL_DCL(Atan); +EXTERN_CALL_DCL(Atanh); +EXTERN_CALL_DCL(Tanh); +EXTERN_CALL_DCL(Cbrt); +EXTERN_CALL_DCL(Clz); +EXTERN_CALL_DCL(Popc); + +#undef EXTERN_CALL_DCL + +//! extern call binary op +#define EXTERN_BINARY_CALL_DCL(name__) Expr name__(Expr a, Expr b); + +EXTERN_BINARY_CALL_DCL(FloorDivide); +EXTERN_BINARY_CALL_DCL(Remainder); +EXTERN_BINARY_CALL_DCL(Mod); +EXTERN_BINARY_CALL_DCL(LogicalRightShift); +EXTERN_BINARY_CALL_DCL(Pow); + +#undef EXTERN_BINARY_CALL_DCL + +inline Expr Sigmoid(Expr e) { + auto one = One(e->type()); + return one / (one + Exp(-e)); +} + +inline Expr Sign(Expr e) { + auto zero = Zero(e->type()); + auto one = One(e->type()); + auto neg_one = ir::Cast::Make(e->type(), Expr(-1)); + auto ret0 = ir::Select::Make(ir::EQ::Make(e, zero), zero, e); + auto ret1 = ir::Select::Make(e > zero, one, ret0); + auto ret2 = ir::Select::Make(e < zero, neg_one, ret1); + return ret2; +} + +Expr Abs(Expr e); + +inline Expr Negative(Expr e) { return -e; } +inline Expr Identity(Expr e) { return e; } +inline Expr LogicalNot(Expr e) { return !e; } +inline Expr BitwiseNot(Expr e) { return ~e; } +inline Expr BitwiseAnd(Expr a, Expr b) { return a & b; } +inline Expr BitwiseOr(Expr a, Expr b) { return a | b; } +inline Expr BitwiseXor(Expr a, Expr b) { return a ^ b; } +inline Expr LeftShift(Expr a, Expr b) { return a << b; } +inline Expr RightShift(Expr a, Expr b) { return a >> b; } + +inline Expr Relu(Expr e, double threshold = 0.0) { + return ir::Max::Make(e, ir::Cast::Make(e->type(), Expr(threshold))); +} + +inline Expr Relu6(Expr e, double threshold = 0.0) { + return ir::Min::Make(ir::Max::Make(e, ir::Cast::Make(e->type(), Expr(threshold))), + ir::Cast::Make(e->type(), Expr(6.0))); +} + +inline Expr LeakyRelu(Expr e, double alpha) { + auto zero = Zero(e->type()); + return ir::Select::Make(e > zero, e, e * ir::Cast::Make(e->type(), Expr(alpha))); +} + +inline Expr LeakyRelu(Expr e, Expr alpha) { + auto zero = Zero(e->type()); + return ir::Select::Make(e > zero, e, e * alpha); +} + +inline Expr ReduceSum(Expr e, const std::vector& reduce_axis, Expr initial = Expr()) { + if (!initial.defined()) { + initial = Zero(e->type()); + } + return ir::Reduce::Make(ir::Reduce::kSum, initial, e, reduce_axis); +} + +inline Expr ReduceMul(Expr e, const std::vector& reduce_axis, Expr initial = Expr()) { + if (!initial.defined()) { + initial = One(e->type()); + } + return ir::Reduce::Make(ir::Reduce::kMul, initial, e, reduce_axis); +} + +inline Expr ReduceMax(Expr e, const std::vector& reduce_axis, Expr initial = Expr()) { + if (!initial.defined()) { + initial = min_value(e.type()); + } + return ir::Reduce::Make(ir::Reduce::kMax, initial, e, reduce_axis); +} +inline Expr ReduceMin(Expr e, const std::vector& reduce_axis, Expr initial = Expr()) { + if (!initial.defined()) { + initial = max_value(e.type()); + } + return ir::Reduce::Make(ir::Reduce::kMin, initial, e, reduce_axis); +} +inline Expr ReduceAll(Expr e, const std::vector& reduce_axis, Expr initial = 
Expr()) { + if (!initial.defined()) { + initial = Expr(true); + } + return ir::Reduce::Make(ir::Reduce::kAll, initial, e, reduce_axis); +} +inline Expr ReduceAny(Expr e, const std::vector& reduce_axis, Expr initial = Expr()) { + if (!initial.defined()) { + initial = Expr(false); + } + return ir::Reduce::Make(ir::Reduce::kAny, initial, e, reduce_axis); +} + +Expr IsNan(Expr e); + +Expr Infinity(const Type& type); + +Expr IsInf(Expr e); + +Expr IsFinite(Expr e); + +} // namespace lang +} // namespace cinn diff --git a/paddle/cinn/lang/compute.cc b/paddle/cinn/lang/compute.cc new file mode 100644 index 0000000000000..ac2e83ede44cf --- /dev/null +++ b/paddle/cinn/lang/compute.cc @@ -0,0 +1,229 @@ +// Copyright (c) 2021 CINN Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "cinn/lang/compute.h" + +#include "cinn/backends/extern_func_protos.h" +#include "cinn/common/common.h" +#include "cinn/ir/operation.h" +#include "cinn/optim/ir_simplify.h" +#include "cinn/poly/dim.h" +#include "cinn/poly/domain.h" +#include "cinn/poly/stage.h" +#include "cinn/runtime/use_extern_funcs.h" + +namespace cinn { +namespace lang { + +ir::Tensor Compute(const std::vector &domain, + std::function fn, + const std::string &name, + const std::vector &shape) { + return Compute( + domain, + [fn](const std::vector &axis) -> Expr { + // CHECK_EQ(axis.size(), 0); + return fn(); + }, + name, + shape); +} + +ir::Tensor Compute(const std::vector &domain, + std::function fn, + const std::string &name, + const std::vector &shape) { + return Compute( + domain, + [fn](const std::vector &axis) -> Expr { + CHECK_EQ(axis.size(), 1); + return fn(axis[0]); + }, + name, + shape); +} + +ir::Tensor Compute(const std::vector &domain, + std::function fn, + const std::string &name, + const std::vector &shape) { + return Compute( + domain, + [fn](const std::vector &axis) -> Expr { + CHECK_EQ(axis.size(), 2); + return fn(axis[0], axis[1]); + }, + name, + shape); +} + +ir::Tensor Compute(const std::vector &domain, + std::function fn, + const std::string &name, + const std::vector &shape) { + return Compute( + domain, + [fn](const std::vector &axis) -> Expr { + CHECK_EQ(axis.size(), 3); + return fn(axis[0], axis[1], axis[2]); + }, + name, + shape); +} + +ir::Tensor Compute(const std::vector &domain, + std::function fn, + const std::string &name, + const std::vector &shape) { + return Compute( + domain, + [fn](const std::vector &axis) -> Expr { + CHECK_EQ(axis.size(), 4); + return fn(axis[0], axis[1], axis[2], axis[3]); + }, + name, + shape); +} + +ir::Tensor Compute(const std::vector &domain, + std::function fn, + const std::string &name, + const std::vector &shape) { + return Compute( + domain, + [fn](const std::vector &axis) -> Expr { + CHECK_EQ(axis.size(), 5); + return fn(axis[0], axis[1], axis[2], axis[3], axis[4]); + }, + name, + shape); +} + +ir::Tensor Compute(const std::vector &domain, + std::function fn, + const std::string &name, + const std::vector &shape) { + return Compute( + domain, 
+ [fn](const std::vector &axis) -> Expr { + CHECK_EQ(axis.size(), 6); + return fn(axis[0], axis[1], axis[2], axis[3], axis[4], axis[5]); + }, + name, + shape); +} + +ir::Tensor Compute(const std::vector &domain, + std::function &)> fn, + const std::string &name, + const std::vector &shape) { + auto axises = common::GenDefaultAxis(domain.size()); + std::vector _axis; + for (auto &x : axises) _axis.push_back(x); + Expr fn_body = fn(_axis); + + std::vector reduce_axis; + if (fn_body.defined() && fn_body.As()) { + auto &fn_reduce_axis = fn_body.As()->reduce_axis; + reduce_axis.insert(std::begin(reduce_axis), fn_reduce_axis.begin(), fn_reduce_axis.end()); + } + + // When the fn_body is a CallExtern, a tensor will return directly. + if (fn_body.as_tensor()) { + return fn_body.as_tensor_ref(); + } + + // shape is the buffer's shape. + std::vector domain_without_reduce_axis; + std::vector shape_simplified; + + // construct the shape. + for (auto dim : domain) { + auto copied = dim; + optim::Simplify(&copied); + domain_without_reduce_axis.push_back(copied); + } + + for (auto dim : shape) { + auto copied = dim; + optim::Simplify(&copied); + shape_simplified.push_back(copied); + } + + auto real_shape = shape_simplified.empty() ? domain_without_reduce_axis : shape_simplified; + + // The body returns void, that means no buffer is needed. + if (fn_body.type() == Void()) real_shape.clear(); + + auto unique_name = name.empty() ? Context::Global().NewName("tensor") : name; + + // check reduce_axis not include the reserved axis name + for (auto &ra : reduce_axis) { + CHECK(!common::IsAxisNameReserved(ra->name)) << "reduce axis [" << ra->name << "]'s name is reserved"; + } + + VLOG(3) << "tensor " << name << "'s domain is : " << domain_without_reduce_axis; + + auto op = ir::ComputeOp::Make(unique_name, fn, real_shape, domain_without_reduce_axis, reduce_axis); + auto tensor = ir::Tensor(unique_name, fn_body.type(), real_shape, domain_without_reduce_axis, op, reduce_axis); + return tensor; +} + +std::vector CallLowered(const std::string &func_name, + const std::vector &args, + const std::vector &return_types) { + auto call = ir::Call::Make(Void(), func_name, args, {}, ir::CallType::CINN, ir::FunctionRef(), 0); + std::vector new_tensors; + for (int i = 0; i < return_types.size(); i++) { + auto &return_type = return_types[i]; + auto call_op = ir::CallOp::Make(func_name, call); + auto new_tensor = ir::Tensor(return_type.name, return_type.type, return_type.dims, {Expr(1)}, call_op); + // Append write tensors in the tail. + call.As()->write_args.push_back(new_tensor); + new_tensor->set_type(return_type.type); + new_tensor->WithBuffer(); + new_tensors.push_back(new_tensor); + } + + return new_tensors; +} + +Expr CallExtern(const std::string &func_name, + const std::vector &args, + const std::map &attrs) { + auto *proto = backends::ExternFunctionProtoRegistry::Global().Lookup(func_name); + CHECK(proto) << "No extern function prototype " << func_name << " found\n" + << "existing records are:\n" + << backends::ExternFunctionProtoRegistry::Global().debug_string(); + + auto call = ir::Call::Make(proto->ret_type, func_name, args, {}, ir::CallType::Extern, ir::FunctionRef(), 0, attrs); + std::vector mutable_args; + // Call a function with multiple outputs. 
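+  // Two cases follow, keyed on the prototype's return type:
+  //  1. void return: every result is written through a mutable tensor
+  //     argument; each output is modeled below as a tuple element, i.e. a
+  //     Tensor whose CallOp records the slot (value_slot) it reads from.
+  //  2. POD return: the Call expression itself carries the value and is
+  //     returned unchanged at the end of this function.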
+ if (proto->ret_type.is_void()) { + for (int i = 0; i < proto->mutable_arg_types.size(); i++) { + auto shape = proto->shape_inference(args, i); + auto op = ir::CallOp::Make(func_name, call); + op->as()->value_slot = i; + op->as()->is_tuple_get = true; + auto name = cinn::UniqName("tuple_" + func_name + "_out" + std::to_string(i) + "_"); + auto ret = ir::Tensor(name, proto->mutable_arg_types[i], shape, shape, op, {}); + mutable_args.push_back(ret); + } + call.As()->write_args = mutable_args; + } + return call; +} + +} // namespace lang +} // namespace cinn diff --git a/paddle/cinn/lang/compute.h b/paddle/cinn/lang/compute.h new file mode 100755 index 0000000000000..230a2037c80a0 --- /dev/null +++ b/paddle/cinn/lang/compute.h @@ -0,0 +1,132 @@ +// Copyright (c) 2021 CINN Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include + +#include +#include +#include +#include +#include + +#include "cinn/ir/ir.h" +#include "cinn/ir/ir_operators.h" +#include "cinn/lang/placeholder.h" +#include "cinn/poly/schedule.h" + +namespace cinn { +namespace lang { + +using compute_handler_t = std::function &)>; +using attr_t = absl::variant; + +//! Compute methods for one to five Vars as arguments. +// @{ +// The shape are constant integers. +ir::Tensor Compute(const std::vector &domain, + std::function fn, + const std::string &name, + const std::vector &shape = {}); +ir::Tensor Compute(const std::vector &domain, + std::function fn, + const std::string &name, + const std::vector &shape = {}); +ir::Tensor Compute(const std::vector &domain, + std::function fn, + const std::string &name, + const std::vector &shape = {}); +ir::Tensor Compute(const std::vector &domain, + std::function fn, + const std::string &name, + const std::vector &shape = {}); +ir::Tensor Compute(const std::vector &domain, + std::function fn, + const std::string &name, + const std::vector &shape = {}); + +ir::Tensor Compute(const std::vector &domain, + std::function fn, + const std::string &name, + const std::vector &shape = {}); + +ir::Tensor Compute(const std::vector &domain, + std::function fn, + const std::string &name, + const std::vector &shape = {}); + +ir::Tensor Compute(const std::vector &domain, + compute_handler_t fn, + const std::string &name, + const std::vector &shape = {}); +// @} + +struct ReturnType { + Type type; + std::vector dims; + std::string name; +}; + +/** + * \brief Call a lowered function and return one or more tensors as result. + * + * A lowered function is generated by lang::Lower method. + * + * TODO(Superjomn) Add a registry (symbol table?) to make return result inference automatically. + * + * @param func_name The name of the function to call. + * @param args The readonly arguments(while the mutable tensors are return result). + * @param return_types The types of the return values. + * @return Return one or more tensors as result. 
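+ *
+ * A minimal usage sketch (the function name and shapes here are illustrative only):
+ * \code
+ * std::vector<ReturnType> return_types = {{Float(32), {M, Expr(20)}, "C"}};
+ * auto outs = CallLowered("lowered_fn0", {Expr(x), Expr(y)}, return_types);
+ * \endcode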
+ */
+std::vector<ir::Tensor> CallLowered(const std::string &func_name,
+                                    const std::vector<Expr> &args,
+                                    const std::vector<ReturnType> &return_types);
+
+/**
+ * \brief Call an external function and get some tensors as result.
+ *
+ * There are two kinds of extern functions, distinguished by the return type.
+ *
+ * 1. Void: the results are written through one or more mutable tensors in the argument list.
+ * \code
+ * Tensor tuple = Compute({M}, []() { return CallExtern("mkl_gemm", {X, W}); });
+ * \endcode
+ *
+ * which will generate something like
+ *
+ * \code
+ * for (i) {
+ *   gemm_mkl(X[i], gemm_out[i])
+ * }
+ * \endcode
+ *
+ * To support returning multiple values at a time, we introduce the tuple concept: a tuple is a Tensor whose CallOp
+ * is marked with a value_offset (from 0 to num_returns-1).
+ *
+ * 2. POD value: an expression is returned directly, and it can be inline-expanded in the following computations.
+ * \code
+ * Tensor tanh_out = Compute({M}, [](Var i) { return CallExtern("tanh", X(i)); });
+ * \endcode
+ *
+ * @param func_name The name of the function to call.
+ * @param args The read-only arguments (the mutable result tensors, if any, are appended to the call's write_args).
+ * @param attrs The read-only attributes.
+ */
+Expr CallExtern(const std::string &func_name,
+                const std::vector<Expr> &args,
+                const std::map<std::string, attr_t> &attrs = {});
+
+}  // namespace lang
+}  // namespace cinn
diff --git a/paddle/cinn/lang/compute_test.cc b/paddle/cinn/lang/compute_test.cc
new file mode 100644
index 0000000000000..cca239df92fbc
--- /dev/null
+++ b/paddle/cinn/lang/compute_test.cc
@@ -0,0 +1,39 @@
+// Copyright (c) 2021 CINN Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "cinn/lang/compute.h"
+
+#include <gtest/gtest.h>
+
+#include "cinn/cinn.h"
+#include "cinn/ir/ir_operators.h"
+#include "cinn/ir/tensor.h"
+#include "cinn/lang/buffer.h"
+#include "cinn/lang/placeholder.h"
+
+namespace cinn {
+namespace lang {
+
+TEST(Call, basic) {
+  Expr M(100);
+
+  Placeholder<float> x("x", {M, Expr(10)});
+  Placeholder<float> y("y", {M, Expr(10)});
+
+  std::vector<ReturnType> return_types({{Float(32), std::vector<Expr>{{M, Expr(20)}}, "C"}});
+  auto tensors = CallLowered("lowered_fun0", {Expr(x), Expr(y)}, return_types);
+}
+
+}  // namespace lang
+}  // namespace cinn
diff --git a/paddle/cinn/lang/lower.cc b/paddle/cinn/lang/lower.cc
new file mode 100755
index 0000000000000..5781c69e3b853
--- /dev/null
+++ b/paddle/cinn/lang/lower.cc
@@ -0,0 +1,302 @@
+// Copyright (c) 2021 CINN Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
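+
+// lower.cc implements the lang::Lower / lang::LowerVec entry points declared
+// in lower.h: it feeds the stages and arguments into detail::LowerImpl, then
+// attaches temporary buffers and the device API to each resulting
+// ir::LoweredFunc.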
+ +#include "cinn/lang/lower.h" + +#include +#include +#include +#include +#include +#include + +#include "cinn/ir/buffer.h" +#include "cinn/ir/ir_printer.h" +#include "cinn/lang/lower_impl.h" +#include "cinn/optim/optimize.h" +#include "cinn/utils/string.h" + +namespace cinn { +namespace lang { + +using ir::Tensor; +using poly::Stage; + +std::vector GetArgs(const Expr& func_body, const std::vector& input_output_nodes) { + std::vector res; + std::map> name2loads; + std::map> name2stores; + auto load_or_store_nodes = ir::CollectIRNodesWithoutTensor( + func_body, [&](const Expr* x) { return x->As() || x->As(); }); + + for (auto&& e : load_or_store_nodes) { + if (e.As()) { + auto&& tensor_name = e.As()->tensor.as_tensor()->name; + name2loads[tensor_name].insert(e.As()); + } else { // Store node + auto&& tensor_name = e.As()->tensor.as_tensor()->name; + name2stores[tensor_name].insert(e.As()); + } + } + + for (auto&& node_name : input_output_nodes) { + auto load_it = name2loads.find(node_name); + auto store_it = name2stores.find(node_name); + // if a node is ir::Load and also ir::Store, then process it as a ir::Store in priority. + if (store_it != name2stores.end()) { // + for (auto&& node : store_it->second) { + const auto* tensor = node->tensor.as_tensor(); + if (tensor->buffer.defined()) { + res.emplace_back(tensor->buffer, ir::Argument::IO::kOutput); + break; + } + } + } else if (load_it != name2loads.end()) { + for (auto&& node : load_it->second) { + const auto* tensor = node->tensor.as_tensor(); + if (tensor->buffer.defined()) { + res.emplace_back(tensor->buffer, ir::Argument::IO::kInput); + break; + } + } + } + } + + if (VLOG_IS_ON(3)) { + for (auto& i : input_output_nodes) VLOG(3) << "In input_output_nodes, arg has : " << i; + for (auto& i : res) VLOG(3) << "In res, arg has : " << i.name(); + } + return res; +} + +//! Collect the temporary tensors from a computational graph. +std::vector GetTempBuffers(const std::vector& tensor_args, + const poly::StageMap& stage_map, + Expr body) { + std::unordered_set tensor_arg_names; + std::unordered_set buffer_arg_names; + for (auto& tensor : tensor_args) { + tensor_arg_names.insert(tensor->name); + if (tensor->buffer.defined()) { + buffer_arg_names.insert(tensor->buffer->name); + } + } + std::map name_to_buffer; // used to avoid duplication. 
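+
+  // A tensor counts as a temporary here when it owns a buffer that is neither
+  // a function argument nor produced by an inlined stage; buffers whose name
+  // ends with "temp_buffer" are always kept. When several tensors share a
+  // buffer name, the smallest buffer (by numel) wins, see the update below.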
+ + auto all_temp_tensors = ir::CollectIRNodesWithoutTensor(body, [&](const Expr* x) { + return x->as_tensor() && x->as_tensor()->buffer.defined() && + (!stage_map->Lookup(x->as_tensor()->name) || !stage_map[x->as_tensor()]->inlined()) && + ((!buffer_arg_names.count(x->as_tensor()->buffer->name) && !tensor_arg_names.count(x->as_tensor()->name)) || + utils::Endswith(x->as_tensor()->buffer->name, "temp_buffer")); + }); + for (auto& e : all_temp_tensors) { + auto buffer_name = e.as_tensor()->buffer->name; + if (!name_to_buffer.count(buffer_name)) { + name_to_buffer[buffer_name] = e.as_tensor()->buffer; + } else { + if (e.as_tensor()->buffer->numel() < name_to_buffer[buffer_name]->numel()) { + name_to_buffer[buffer_name] = e.as_tensor()->buffer; + } + } + } + // visit the ir body and update the map of name_to_buffer + auto update_map = ir::CollectIRNodesWithoutTensor(body, [&](const Expr* x) { + if (x->as_tensor() && x->as_tensor()->buffer.defined()) { + auto buffer_name = x->as_tensor()->buffer->name; + if (name_to_buffer.count(buffer_name) && x->as_tensor()->buffer->numel() < name_to_buffer[buffer_name]->numel()) { + name_to_buffer[buffer_name] = x->as_tensor()->buffer; + } + } + return x->as_tensor() && x->as_tensor()->buffer.defined(); + }); + + std::vector temp_buffers; + for (auto& i : name_to_buffer) temp_buffers.push_back(i.second); + return temp_buffers; +} + +//! Collect the temporary tensors from a computational graph. +std::vector GetTempBuffers(const std::vector& args, Expr body) { + std::unordered_set buffer_arg_names; + for (auto& a : args) { + if (a.is_buffer()) { + buffer_arg_names.insert(a.name()); + } + } + std::map name_to_buffer; // used to avoid duplication. + + auto all_temp_tensors = ir::CollectIRNodesWithoutTensor(body, [&](const Expr* x) { + return x->as_tensor() && x->as_tensor()->buffer.defined() && + (!buffer_arg_names.count(x->as_tensor()->buffer->name) || + utils::Endswith(x->as_tensor()->buffer->name, "temp_buffer")); + }); + for (auto& e : all_temp_tensors) { + auto buffer_name = e.as_tensor()->buffer->name; + if (!name_to_buffer.count(buffer_name)) { + name_to_buffer[buffer_name] = e.as_tensor()->buffer; + } else { + if (e.as_tensor()->buffer->numel() < name_to_buffer[buffer_name]->numel()) { + name_to_buffer[buffer_name] = e.as_tensor()->buffer; + } + } + } + // visit the ir body and update the map of name_to_buffer + auto update_map = ir::CollectIRNodesWithoutTensor(body, [&](const Expr* x) { + if (x->as_tensor() && x->as_tensor()->buffer.defined()) { + auto buffer_name = x->as_tensor()->buffer->name; + if (name_to_buffer.count(buffer_name) && x->as_tensor()->buffer->numel() < name_to_buffer[buffer_name]->numel()) { + name_to_buffer[buffer_name] = x->as_tensor()->buffer; + } + } + return x->as_tensor() && x->as_tensor()->buffer.defined(); + }); + + std::vector temp_buffers; + for (auto& i : name_to_buffer) temp_buffers.push_back(i.second); + return temp_buffers; +} + +std::set CollectTempTensorsFromCtrlDepends(StageMap stages, const std::vector& tensor_args) { + std::set res; + for (auto& stage : stages) { + res.emplace(ir::Tensor(stage.second->tensor())); + res.insert(stage.second->ctrl_depends().begin(), stage.second->ctrl_depends().end()); + } + for (auto& t : tensor_args) { + if (res.count(t)) res.erase(t); + } + return res; +} + +void InitReduceTensor(StageMap stages, const Tensor& tensor, const Target& target) { + if (tensor->is_reduce_tensor() && !tensor->IsReduceInited(stages)) { + tensor->InitReduction(stages, target); + } + auto 
uninited_reduce_tensors = ir::CollectIRNodes(tensor->body(), [&](const Expr* x) {
+    return x && x->defined() && x->as_tensor() && x->as_tensor()->is_reduce_tensor() &&
+           !x->as_tensor()->IsReduceInited(stages);
+  });
+  for (auto& t : uninited_reduce_tensors) {
+    VLOG(3) << "Init reduce tensor: " << t.as_tensor()->name;
+    t.as_tensor()->InitReduction(stages, target);
+  }
+}
+
+ir::LoweredFunc Lower(const std::string& name,
+                      StageMap stages,
+                      const std::vector<Tensor>& tensor_args,
+                      const std::vector<Var>& scalar_args,
+                      const std::vector<Tensor>& temp_tensors,
+                      Module::Builder* b,
+                      const Target& target,
+                      bool support_ir_schedule) {
+  // Init the reduce tensors first before any process.
+  for (auto& t : tensor_args) InitReduceTensor(stages, t, target);
+  for (auto& t : temp_tensors) InitReduceTensor(stages, t, target);
+  // Merge the ctrl_deps with the given temp_tensors and get a new temp_tensors.
+  auto ctrl_deps = CollectTempTensorsFromCtrlDepends(stages, tensor_args);
+  ctrl_deps.insert(temp_tensors.begin(), temp_tensors.end());
+  auto lower_impl_instance = detail::LowerImpl(name,
+                                               stages,
+                                               tensor_args,
+                                               scalar_args,
+                                               std::vector<ir::Tensor>(ctrl_deps.begin(), ctrl_deps.end()),
+                                               target,
+                                               support_ir_schedule);
+  auto result = lower_impl_instance();
+  std::vector<ir::LoweredFunc> return_value;
+  for (auto& res : result) {
+    auto temp_buffers = GetTempBuffers(tensor_args, stages, res->body);
+    if (b) {
+      for (auto& temp_buffer : temp_buffers) {
+        b->AddBuffer(temp_buffer);
+      }
+    }
+    {  // set function device_api
+      for (auto& stage : stages) {
+        if (stage.second->IfCudaBind()) {
+          res->device_api = ir::DeviceAPI::GPU;
+          break;
+        }
+      }
+      if (target == common::DefaultNVGPUTarget()) {
+        res->device_api = ir::DeviceAPI::GPU;
+      }
+    }
+    if (b) {
+      b->AddFunction(res);
+    }
+    res->temp_bufs = temp_buffers;
+    return_value.push_back(res);
+  }
+  return return_value[0];
+}
+
+std::vector<ir::LoweredFunc> LowerVec(const std::string& name,
+                                      StageMap stages,
+                                      const std::vector<Tensor>& tensor_args,
+                                      const std::vector<Var>& scalar_args,
+                                      const std::vector<Tensor>& temp_tensors,
+                                      Module::Builder* b,
+                                      const Target& target,
+                                      bool support_ir_schedule) {
+  // Init the reduce tensors first before any process.
+  for (auto& t : tensor_args) InitReduceTensor(stages, t, target);
+  for (auto& t : temp_tensors) InitReduceTensor(stages, t, target);
+  // Merge the ctrl_deps with the given temp_tensors and get a new temp_tensors.
+  auto ctrl_deps = CollectTempTensorsFromCtrlDepends(stages, tensor_args);
+  ctrl_deps.insert(temp_tensors.begin(), temp_tensors.end());
+  auto lower_impl_instance = detail::LowerImpl(name,
+                                               stages,
+                                               tensor_args,
+                                               scalar_args,
+                                               std::vector<ir::Tensor>(ctrl_deps.begin(), ctrl_deps.end()),
+                                               target,
+                                               support_ir_schedule);
+  // Returns a vector of ir::LoweredFunc.
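+  // Each schedule group lowers to one function: the first keeps `name`, the
+  // following ones get the suffixes "_1", "_2", ... (see the declaration in
+  // lower.h). Lower() above is the single-function variant and returns only
+  // the first element.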
+  auto result = lower_impl_instance();
+  std::vector<ir::LoweredFunc> return_value;
+  for (auto& res : result) {
+    auto temp_buffers = GetTempBuffers(tensor_args, stages, res->body);
+    if (b) {
+      for (auto& temp_buffer : temp_buffers) {
+        b->AddBuffer(temp_buffer);
+      }
+    }
+
+    {  // set function device_api
+      for (auto& stage : stages) {
+        if (stage.second->IfCudaBind()) {
+          res->device_api = ir::DeviceAPI::GPU;
+          break;
+        }
+      }
+
+      if (target == common::DefaultNVGPUTarget()) {
+        res->device_api = ir::DeviceAPI::GPU;
+      }
+    }
+    if (b) {
+      b->AddFunction(res);
+    }
+
+    res->temp_bufs = temp_buffers;
+
+    return_value.push_back(res);
+  }
+  return return_value;
+}
+
+}  // namespace lang
+}  // namespace cinn
diff --git a/paddle/cinn/lang/lower.h b/paddle/cinn/lang/lower.h
new file mode 100644
index 0000000000000..d20adad843174
--- /dev/null
+++ b/paddle/cinn/lang/lower.h
@@ -0,0 +1,85 @@
+// Copyright (c) 2021 CINN Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+/**
+ * Lower lowers the statements to LoweredFuncs.
+ */
+
+#pragma once
+#include <string>
+#include <vector>
+
+#include "cinn/ir/ir.h"
+#include "cinn/ir/lowered_func.h"
+#include "cinn/ir/module.h"
+#include "cinn/ir/tensor.h"
+#include "cinn/lang/packed_func.h"
+#include "cinn/poly/schedule.h"
+
+namespace cinn {
+namespace lang {
+using ir::Tensor;
+using poly::StageMap;
+
+/**
+ * \brief Lower the computation of \p tensor_args and \p scalar_args to a LoweredFunc.
+ * @param name The name of the function.
+ * @param tensor_args The tensor arguments, where the computation logic resides.
+ * @param scalar_args The scalar arguments, which usually indicate dimensions.
+ * @param temp_tensors The temporary tensors (buffers) used in the body.
+ * @param b The module this function belongs to.
+ * @return A LoweredFunc whose name is \p name; the argument list is the concatenation of \p scalar_args and \p
+ * tensor_args.
+ */
+ir::LoweredFunc Lower(const std::string &name,
+                      StageMap stages,
+                      const std::vector<Tensor> &tensor_args,
+                      const std::vector<Var> &scalar_args = {},
+                      const std::vector<Tensor> &temp_tensors = {},
+                      ir::Module::Builder *b = nullptr,
+                      const Target &target = common::DefaultHostTarget(),
+                      bool support_ir_schedule = false);
+
+/**
+ * \brief Lower the computation of \p tensor_args and \p scalar_args to a vector of LoweredFuncs. Each schedule group
+ * forms a LoweredFunc.
+ * @param name The name of the function.
+ * @param tensor_args The tensor arguments, where the computation logic resides.
+ * @param scalar_args The scalar arguments, which usually indicate dimensions.
+ * @param temp_tensors The temporary tensors (buffers) used in the body.
+ * @param b The module this function belongs to.
+ * @return A vector of LoweredFuncs named \p name, \p name + "_1", \p name + "_2", ... The argument list is deduced
+ * from the expression of each func.
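+ *
+ * A minimal sketch of the call (the tensors A, B, C and their stages are illustrative only):
+ * \code
+ * auto stages = CreateStages({C});
+ * std::vector<ir::LoweredFunc> fns = LowerVec("fn", stages, {A, B, C});
+ * \endcode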
+ */ +std::vector LowerVec(const std::string &name, + StageMap stages, + const std::vector &tensor_args, + const std::vector &scalar_args = {}, + const std::vector &temp_tensors = {}, + ir::Module::Builder *b = nullptr, + const Target &target = common::DefaultHostTarget(), + bool support_ir_schedule = false); + +std::vector GetArgs(const Expr &func_body, const std::vector &input_output_nodes); + +//! Collect the temporary tensors from a computational graph. +std::vector GetTempBuffers(const std::vector &tensor_args, + const poly::StageMap &stage_map, + Expr body); + +//! Collect the temporary tensors from a computational graph. +std::vector GetTempBuffers(const std::vector &args, Expr body); + +} // namespace lang +} // namespace cinn diff --git a/paddle/cinn/lang/lower_impl.cc b/paddle/cinn/lang/lower_impl.cc new file mode 100644 index 0000000000000..e839fc8ef0507 --- /dev/null +++ b/paddle/cinn/lang/lower_impl.cc @@ -0,0 +1,791 @@ +// Copyright (c) 2021 CINN Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "cinn/lang/lower_impl.h" + +#include +#include +#include +#include + +#include "cinn/common/common.h" +#include "cinn/common/context.h" +#include "cinn/common/ir_util.h" +#include "cinn/ir/ir_base.h" +#include "cinn/ir/ir_printer.h" +#include "cinn/ir/tensor.h" +#include "cinn/optim/remove_nested_block.h" +#include "cinn/optim/replace_var_with_expr.h" +#include "cinn/optim/transform_polyfor_to_for.h" +#include "cinn/poly/stage.h" + +namespace cinn { +namespace lang { +namespace detail { + +void CheckNoIslCallRemains(Expr* expr) { + auto isl_calls = ir::CollectIRNodes( + *expr, [](const Expr* expr) { return expr->As() && expr->As()->is_isl_call(); }); +#ifdef CINN_DEBUG + for (auto& item : isl_calls) { + LOG(ERROR) << "ISL call: " << item; + } +#endif + if (!isl_calls.empty()) { + LOG(WARNING) << "Some ISL call nodes remained, get " << isl_calls.size() << " isl_calls, the first one is " + << *isl_calls.begin(); + } +} + +void BindBuffer(StageMap& stages) { + absl::flat_hash_map tensor_map; + for (auto& stage : stages) { + tensor_map[stage.second->tensor()->name] = stage.second->tensor(); + } + for (auto& stage : stages) { + if (!stage.second->tensor()->buffer.defined() && !stage.second->meta.tensors_to_share_buffer_with.empty()) { + for (auto& str : stage.second->meta.tensors_to_share_buffer_with) { + if (tensor_map[str]->buffer.defined()) { + auto edited_shape = tensor_map[str]->buffer->shape; + stage.second->tensor()->Bind(tensor_map[str]->buffer); + tensor_map[str]->buffer->shape = edited_shape; + VLOG(3) << "Tensor " << stage.second->tensor()->name << " bind buffer to " << tensor_map[str]->name << " , " + << tensor_map[str]->buffer->name; + } + } + } + } +} + +Expr LowerGroup(const poly::ScheduleGroup& group, + const std::map& tuple_to_expr, + std::map* global_tensor_map, + std::unordered_map>& resized_buffer_cache, + StageMap stage_map, + ir::CudaAxisInfo* cuda_axis_info) { + BindBuffer(stage_map); + std::vector stages; + for (auto& 
node : group.nodes) { + VLOG(1) << "In LowerGroup, node id is: " << node->id(); + if (node->stage->has_expression()) { + stages.push_back(node->stage); + VLOG(1) << "stage expr " << node->stage->expr(); + } else { + VLOG(1) << "stage expression is null: " << node->stage->domain(); + } + } + + if (stages.empty()) return Expr(); + + // get isl generated expression + isl::set context(Context::isl_ctx(), "{:}"); + poly::AstGen gen(context, stages, group); + isl::ast_node ast = gen.Build(); + ir::Expr e; + + // The code where adds length 1 loop back to CINN Expr, if you do not want to + // add back, call poly::IslAstNodeToCinnExpr(ast, &e) instead of + // poly::IslAstNodeToCinnExpr(ast, gen.domain(), &e); + + VLOG(6) << "before ast to expr"; + // poly::IslAstNodeToCinnExpr(ast, &e); + poly::IslAstNodeToCinnExpr(ast, gen.domain(), &e); + // now we get a workable expression, but the statement are something like `B(((16 * po0) + po1), po2)`, we need to + // transform this to some realworld statement in CINN. + + VLOG(1) << "ast to expr: \n" << e << std::endl; + + // replace isl call to the corresponding CINN statement, we need to replace the axis at the same time. + for (auto& statement : tuple_to_expr) { + VLOG(2) << "LowerGroup working on statement: " << statement.first; + if (!gen.ContainsStatement(statement.first)) continue; + // the axis_ast_map contains the axis from the original (like `i`) to the transformed (like `i+3`). + auto axis_expr_map = gen.axis2expr(statement.first); + for (auto& item : axis_expr_map) { + VLOG(4) << "statement ast map axis [" << item.first << "] to " + << "[" << item.second << "]"; + } + + // the original CINN statements. + Expr statement_candi_expr = tuple_to_expr.at(statement.first); + + VLOG(3) << "replacing " << statement.first << " to " << statement_candi_expr; + optim::ReplaceIslCallWithExpr(&e, statement.first, statement_candi_expr, axis_expr_map); + } + CheckNoIslCallRemains(&e); + + // Update global_tensor_map + for (auto& e : stage_map) { + if (!global_tensor_map->count(e.second->id())) { + (*global_tensor_map)[e.second->id()] = ir::Tensor(e.second->tensor()); + } + } + + // mark vectorize. + { + std::map vectorizes; + for (auto& node : group.nodes) { + if (node->stage->vectorize_info().valid()) { + vectorizes[node->stage->id()] = node->stage->vectorize_info(); + } + } + MarkVectorizeMutator mutator(vectorizes); + mutator(&e); + } + + // mark unroll. + { + std::map> unrolls; + for (auto& node : group.nodes) { + if (!node->stage->unroll_info().empty()) { + unrolls[node->stage->id()] = node->stage->unroll_info(); + } + } + MarkUnrollMutator mutator(unrolls); + mutator(&e); + } + + // mark parallel. + { + std::map> parallels; + for (auto& node : group.nodes) { + if (!node->stage->parallel_info().empty()) { + parallels[node->stage->id()] = node->stage->parallel_info(); + } + } + MarkParallelMutator mutator(parallels); + mutator(&e); + } + + return e; +} + +bool TensorContainsGPUInfo(ir::Tensor t, poly::Stage* stage) { + if (stage->inlined()) return false; + if (stage) { + for (auto& info : stage->forloop_infos()) { + if (info.second.device == ir::DeviceAPI::GPU) { + return true; + } + } + } + return false; +} + +const char* CompuGraphNode::__type_info__ = "ComputeGraphNode"; +const char* CompuGraphNode::type_info() const { return __type_info__; } +std::string CompuGraphNode::id() const { + CHECK(tensor.defined()); + return tensor->name; +} + +/** + * \brief Add nodes to graph with dependencies. 
+ * We create a computation graph based on the tensor dependency relations. + * NOTE The graph will contain the inline tensors so that the dependency will be reserved. + * @param graph The graph + * @param t The tensor. + * @param stages The stage map. + */ +void CreateCompGraphWithInlineTensors(common::Graph* graph, + const ir::Tensor& t, + StageMap stages, + std::set* visited) { + if (visited->count(t)) return; + common::GraphNode* t_node = graph->RetrieveNode(t->name); + if (!t_node) { + t_node = graph->RegisterNode(t->name, new CompuGraphNode(t)); + } + + visited->insert(t); + + // collect dependency tensors of t + // here we just collect the tensors in Load nodes + // NOTE there may be some other cases. + auto deps = ir::CollectLoadTensors(t->body(), [](const Expr* x) { return x->as_tensor(); }); + for (const auto& dep : deps) { + auto e_tensor = dep.as_tensor_ref(); + auto* e_node = graph->RetrieveNode(e_tensor->name); + if (!e_node) { + e_node = graph->RegisterNode(e_tensor->name, new CompuGraphNode(e_tensor)); + } + e_node->Controls(t_node); + if (!visited->count(e_tensor)) { + CreateCompGraphWithInlineTensors(graph, e_tensor, stages, visited); + } + } +} + +std::unique_ptr CreateCompGraphWithInlineTensorHidden(const std::vector& tensors, + StageMap stages) { + // create a graph with inline tensor first. + std::unique_ptr graph(new common::Graph); + std::set visited; + for (auto& t : tensors) { + CreateCompGraphWithInlineTensors(graph.get(), t, stages, &visited); + } + + // greedy remove the inline tensor, each time merge the inputs of an inline tensor to its sink node. + + std::set inline_nodes; + do { + inline_nodes = graph->CollectNodes([&](const common::GraphNode* x) { + auto* comp_node = x->safe_as(); + return stages[comp_node->tensor]->inlined(); + }); + if (inline_nodes.empty()) break; + + /* + * A -> inlined -> B + * C / + * => + * A -> B + * C / + */ + for (auto* inline_node : inline_nodes) { + // remove this node, merge its inputs to the sink nodes. + auto inline_inlinks = inline_node->inlinks(); + auto inline_outlinks = inline_node->outlinks(); + + // unlink the inline node from its inputs and outputs + for (auto& link : inline_inlinks) { + link->source()->UnLinkSingleTo(link->sink()); + } + for (auto& link : inline_outlinks) { + link->source()->UnLinkSingleTo(link->sink()); + } + + // link inline node's input nodes to its output nodes. 
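+      // i.e. for every (source -> inlined) and (inlined -> sink) pair that was
+      // unlinked above, connect source directly to sink: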
+ for (auto out_edge : inline_outlinks) { + auto* out = out_edge->sink(); + for (auto in_edge : inline_inlinks) { + auto* source = in_edge->source(); + source->LinkTo(out); + } + } + + graph->DropNode(inline_node); + } + } while (!inline_nodes.empty()); + + return graph; +} + +void CompuGraphAddCtrlDepLinks(common::Graph* graph, StageMap stages) { + for (auto& x : graph->nodes()) { + auto* node = x->safe_as(); + CHECK(node); + for (auto& dep : stages[node->tensor]->ctrl_depends()) { + auto* dep_node = graph->RetrieveNode(dep->name); + if (dep_node) { + VLOG(3) << "Add control link: " << dep << " -> " << node->id(); + dep_node->Controls(node); + } + } + } +} + +std::unique_ptr CreateCompGraph(const std::vector& tensors, + StageMap stages, + bool hide_inline) { + if (hide_inline) { + auto graph = CreateCompGraphWithInlineTensorHidden(tensors, stages); + CompuGraphAddCtrlDepLinks(graph.get(), stages); + return graph; + } else { + auto graph = std::make_unique(); + std::set visited; + for (auto& t : tensors) { + CreateCompGraphWithInlineTensors(graph.get(), t, stages, &visited); + } + CompuGraphAddCtrlDepLinks(graph.get(), stages); + return graph; + } +} + +void LowerImpl::CheckArgsUnique() { + for (auto& tensor : tensor_args_) { + CHECK(!stages_[tensor]->inlined()) << "Inline tensor cannot be argument of function"; + if (!tensor->buffer.defined()) { + LOG(ERROR) << "tensor [" << tensor->name << "] buffer is null"; + continue; + } + } +} + +std::vector LowerImpl::GenerateFunctionArgumentList(Expr fn_body) { + CheckArgsUnique(); + + std::vector args; + optim::TensorWriteTeller teller; + teller.Collect(&fn_body); + + std::set arg_names; + + for (auto& scalar : scalar_args_) { + CHECK(!arg_names.count(scalar->name)); + auto* scalar_node = scalar.As(); + CHECK(scalar_node->type().valid()); + arg_names.insert(scalar->name); + + args.emplace_back(scalar, ir::Argument::IO::kInput); + } + + for (auto& tensor : tensor_args_) { + auto* tensor_node = tensor.As(); + bool is_output = teller.IsWrite(tensor->name); + VLOG(1) << "tensor argument " << tensor->name << " buffer " << tensor->buffer->name; + + // avoid duplicate + if (!tensor_node->buffer.defined()) continue; + // if a argument is already marked as kInput, mark it as kOutput and move it to the back. + if (arg_names.count(tensor_node->buffer->name)) { + auto it = std::find_if( + args.begin(), args.end(), [&](const ir::Argument& x) { return x.name() == tensor_node->buffer->name; }); + CHECK(it != args.end()); + if (it->is_input()) { + args.erase(it); + } else if (it->is_output()) { + continue; + } + } + + arg_names.insert(tensor_node->buffer->name); + + auto io = is_output ? ir::Argument::IO::kOutput : ir::Argument::IO::kInput; + VLOG(3) << "Collect " << (is_output ? "W" : "R") << " argument " << tensor->buffer->name; + args.emplace_back(tensor_node->buffer, io); + } + + return args; +} +// Generate Function Arguments for splitted kernel. 
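+// Unlike GenerateFunctionArgumentList above, this variant only keeps tensors
+// that actually appear in `func_iterator`, skips the given `temp_tensors`,
+// and orders the result as all input arguments followed by all outputs.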
+std::vector LowerImpl::GenFuncArgForSplitKernel(Expr func_iterator, + std::vector temp_tensors) { + CheckArgsUnique(); + + std::vector in_args; + std::vector out_args; + optim::TensorWriteTeller teller; + teller.Collect(&func_iterator); + std::set arg_names; + std::set all_tensor_names; + + for (auto& scalar : scalar_args_) { + CHECK(!arg_names.count(scalar->name)); + auto* scalar_node = scalar.As(); + CHECK(scalar_node->type().valid()); + arg_names.insert(scalar->name); + + in_args.emplace_back(scalar, ir::Argument::IO::kInput); + } + + auto all_tensors = ir::CollectIRNodes( + func_iterator, [&](const Expr* x) { return x->as_tensor() && !stages_[x->as_tensor()]->inlined(); }); + + auto all_vars = ir::CollectIRNodes(func_iterator, [&](const Expr* x) { return x->as_var(); }); + + for (auto& i : all_tensors) { + auto* tensor = i.as_tensor(); + all_tensor_names.insert(tensor->name); + VLOG(3) << "In all_tensors, it has : " << tensor->name; + if (!stages_[tensor]->meta.tensors_to_share_buffer_with.empty()) { + for (auto& i : stages_[tensor]->meta.tensors_to_share_buffer_with) { + all_tensor_names.insert(i); + VLOG(3) << "And its share_buffer_tensor is : " << i; + } + } + } + for (auto& i : all_vars) { + auto* var = i.as_var(); + VLOG(3) << "In all_vars, it has : " << var->name; + } + + for (auto& i : scalar_args_) { + VLOG(3) << "In scalar_args_, var has : " << i->name; + } + + std::set temp_tensor_names; + + for (auto& i : temp_tensors) { + VLOG(3) << "In temp_tensors, it has : " << i->name; + temp_tensor_names.insert(i->name); + } + + for (auto& tensor : tensor_args_) { + VLOG(3) << "In tensor_args_, it has : " << tensor->name; + if (temp_tensor_names.count(tensor->name) > 0) continue; + if (all_tensor_names.count(tensor->name) == 0) continue; + bool is_output = teller.IsWrite(tensor->name); + VLOG(3) << "tensor argument " << tensor->name << " buffer " << tensor->buffer->name; + + // avoid duplicate + if (!tensor->buffer.defined()) { + VLOG(3) << "tensor->buffer is not defined"; + continue; + } + // if a argument is already marked as kInput, mark it as kOutput and move it to the back. + if (arg_names.count(tensor->buffer->name)) { + auto it = std::find_if( + in_args.begin(), in_args.end(), [&](const ir::Argument& x) { return x.name() == tensor->buffer->name; }); + if (it != in_args.end()) { + in_args.erase(it); + } else { + continue; + } + } + + arg_names.insert(tensor->buffer->name); + + auto io = is_output ? ir::Argument::IO::kOutput : ir::Argument::IO::kInput; + if (io == ir::Argument::IO::kInput) + in_args.emplace_back(tensor->buffer, io); + else + out_args.emplace_back(tensor->buffer, io); + } + if (out_args.empty()) { + for (auto& i : all_tensors) { + auto* tensor = i.as_tensor(); + VLOG(3) << "Tensor " << tensor->name; + if (tensor->buffer.defined() && !arg_names.count(tensor->buffer->name)) { + bool is_output = teller.IsWrite(tensor->name) && teller.IsWrite(tensor->name); + if (is_output) out_args.emplace_back(tensor->buffer, ir::Argument::IO::kOutput); + } + } + } + + std::vector args(in_args.begin(), in_args.end()); + args.insert(std::end(args), out_args.begin(), out_args.end()); + return args; +} + +std::vector LowerImpl::CollectTemporaryTensors() { + // a temporary should be in the comp_graph but not contained in the tensor_args. 
+ absl::flat_hash_map tensor_arg_map = GenTensorArgMap(); + absl::flat_hash_map temp_tensor_map; + + for (auto* node : compu_graph_->nodes()) { + auto* cnode = node->safe_as(); + CHECK(cnode); + if (!tensor_arg_map.count(cnode->tensor->name)) { + temp_tensor_map[cnode->tensor->name] = cnode->tensor; + } + } + + std::vector temp_tensors; + std::transform(temp_tensor_map.begin(), + temp_tensor_map.end(), + std::back_inserter(temp_tensors), + [&](const decltype(temp_tensor_map)::value_type& x) { return x.second; }); + return temp_tensors; +} + +absl::flat_hash_map LowerImpl::GenTensorArgMap() { + absl::flat_hash_map map; + for (auto& t : tensor_args_) { + map[t->name] = t; + } + return map; +} + +absl::flat_hash_map LowerImpl::GenAllTensorMap() { + absl::flat_hash_map map; + for (auto& t : CollectAllTensors()) { + map[t->name] = t; + } + return map; +} + +std::vector LowerImpl::operator()() { + std::vector stages; + std::map all_tensor_map; + for (auto& t : CollectAllTensors()) { + all_tensor_map[t->name] = t; + if (!stages_[t]->inlined()) stages.push_back(stages_[t]); + } + + auto deps = CollectExtraDependencies(); + auto schedule = poly::CreateSchedule( + stages, poly::ScheduleKind::Poly, std::vector>(deps.begin(), deps.end())); + auto func_body = GenerateFunctionBody(schedule.get()); + + std::vector result; + int num_func = 0; + for (auto& func_iterator : func_body) { + if (support_ir_schedule_) { + // add ScheduleBlockRealize + func_iterator = ir::ScheduleBlockRealize::Make( + {}, ir::ScheduleBlock::Make({}, {}, {}, common::UniqName("root"), func_iterator)); + } + std::set temp_tensor_names; + for (auto& t : temp_tensor_args_) temp_tensor_names.insert(t->name); + + auto tensor_map = + optim::InitialAssignBuffer(&func_iterator, stages_, all_tensor_map, comp_graph(), temp_tensor_names); + // copy the tensor(with buffer assigned) back to func's args. + { + for (auto& arg : tensor_args_) { + if (arg->is_placeholder_node()) continue; + if (arg->buffer.defined()) continue; + if (arg->body().As() && arg->body().type().is_void()) continue; // extern call + if (tensor_map.find(arg->name) == tensor_map.end()) { + LOG(INFO) << "Didn't find arg tensor " << arg->name << "in tensor_map.\n" + << "The function is " << fn_name_ << "\nAnd all the arg tensors are:\n"; + for (auto& i : tensor_args_) { + LOG(INFO) << i->name; + } + LOG(FATAL) << "Fatal Error!"; + } + Reference(&arg)->buffer = tensor_map.at(arg->name)->buffer; + } + } + auto store_exprs = ir::CollectIRNodes(func_iterator, [](const Expr* x) { return x->As(); }); + std::vector new_temp_tensors; + for (auto& expr : store_exprs) { + auto* store_node = expr.As(); + CHECK(store_node); + auto* tensor = store_node->tensor.As(); + CHECK(tensor); + VLOG(3) << "In store_exprs, its name is : " << tensor->name; + CHECK(tensor->buffer.defined()); + if (tensor->buffer->memory_type != ir::MemoryType::Heap) { + new_temp_tensors.push_back(store_node->tensor.as_tensor_ref()); + } + } + + auto func_temp_tensors = CollectTemporaryTensors(); + std::vector temp_buffers; + std::unordered_set buffer_name_set; + // TODO(Superjomn) write buffer latter. 
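+    // Decide which temporaries become the function's temp_bufs: on the NVGPU
+    // target only buffers of tensors stored outside the heap qualify; on
+    // other targets every collected temporary tensor does.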
+ + if (target_ == common::DefaultNVGPUTarget()) { + for (auto& t : new_temp_tensors) { + if (!tensor_map.count(t->name)) continue; + auto& tt = tensor_map.at(t->name); + if (tt->buffer.defined() && !buffer_name_set.count(tt->buffer->name)) { + temp_buffers.push_back(tt->buffer); + buffer_name_set.insert(tt->buffer->name); + } + } + } else { + for (auto& t : func_temp_tensors) { + if (!tensor_map.count(t->name)) continue; + auto& tt = tensor_map.at(t->name); + if (tt->buffer.defined() && !buffer_name_set.count(tt->buffer->name)) { + temp_buffers.push_back(tt->buffer); + buffer_name_set.insert(tt->buffer->name); + } + } + } + + ir::LoweredFunc func; + if (target_ == common::DefaultNVGPUTarget()) { + auto func_args2 = GenFuncArgForSplitKernel(func_iterator, new_temp_tensors); + std::string new_fn_name = fn_name_; + if (num_func > 0) { + new_fn_name += "_" + std::to_string(num_func); + } + VLOG(3) << "Making func :" << new_fn_name; + for (auto& i : func_args2) { + VLOG(3) << "func_args2 is : " << i.name(); + } + for (auto& i : temp_buffers) { + VLOG(3) << "temp_buffers is : " << i->name; + } + func = ir::_LoweredFunc_::Make(new_fn_name, func_args2, func_iterator, temp_buffers); + } else { + auto func_args = GenerateFunctionArgumentList(func_iterator); + func = ir::_LoweredFunc_::Make(fn_name_, func_args, func_iterator, temp_buffers); + } + + if (support_ir_schedule_) { + optim::TransformPolyForToFor(&func->body); + optim::RemoveNestedBlock(&func->body); + func->body = ir::Block::Make({func->body}); + result.push_back(ir::LoweredFunc(func.get())); + num_func++; + } else { + optim::ComputeInlineExpand(&func->body, stages_, &all_tensor_map); + auto res = + optim::Optimize(func, target_, FLAGS_cinn_runtime_display_debug_info, /* remove_gpu_for_loops = */ false); + + if (cuda_axis_info_.size() > num_func && cuda_axis_info_[num_func].valid()) { + auto* res_func = res.as_lowered_func(); + res_func->cuda_axis_info = cuda_axis_info_[num_func]; + } + result.push_back(ir::LoweredFunc(res.get())); + num_func++; + } + } + return result; +} + +std::vector LowerImpl::CollectAllTensors() { + std::vector tensors; + auto topo_order = compu_graph_->topological_order(); // NOLINT + auto& nodes = std::get<0>(topo_order); + auto& edges = std::get<1>(topo_order); + for (auto* node : nodes) { + auto* cnode = node->safe_as(); + CHECK(cnode); + tensors.push_back(cnode->tensor); + } + return tensors; +} + +std::set> LowerImpl::CollectExtraDependencies() const { + std::set> deps; + for (auto* node : compu_graph_->nodes()) { + auto* cnode = node->safe_as(); + CHECK(cnode); + for (auto& dep : stages_[cnode->tensor]->ctrl_depends()) { + deps.emplace(dep->name, cnode->tensor->name); + } + } + return deps; +} + +std::vector LowerImpl::GenerateFunctionBody(const poly::Schedule* schedule) { + // generate the expressions for each group. 
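+  // Each poly::ScheduleGroup is lowered independently via LowerGroup. On the
+  // NVGPU target, a group that writes any buffer that is not GPU-shared or
+  // GPU-local closes the current block and starts a new function body, which
+  // is why more than one expression can be returned here.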
+ std::vector exprs; + std::vector result; + auto tensor_map = GenAllTensorMap(); + std::map tuple_to_expr; + CHECK(!schedule->groups.empty()) << "no group is generated"; + + std::map global_tensor_map; + std::unordered_map> resized_buffer_cache; + + for (auto& group : schedule->groups) { + CHECK_GT(group.nodes.size(), 0) << "group is empty"; + bool all_temp_tensor = true; + for (auto& node : group.nodes) { + if (!tensor_map.count(node->id())) { + VLOG(2) << "tensor_map doesn't count " << node->id(); + continue; + } + auto& tensor = tensor_map[node->id()]; + if (!tensor->has_expression()) continue; + all_temp_tensor = + all_temp_tensor && (stages_[tensor]->inlined() || + (tensor->buffer.defined() && (tensor->buffer->memory_type == ir::MemoryType::GPUShared || + tensor->buffer->memory_type == ir::MemoryType::GPULocal))); + auto store_body = tensor->tensor_store_expanded_body(); + if (support_ir_schedule_) { + // add schedule block of tensor computation for schedule IR + int var_counts = tensor->shape.size() + tensor->reduce_axis.size(); + std::vector int_shape; + VLOG(3) << "Tensor " << tensor->name << "'s shape is : " << utils::Join(tensor->shape, ","); + for (auto& expr : tensor->shape) { + CHECK(expr.is_constant()); + int_shape.push_back((int)expr.get_constant()); + } + for (auto& var : tensor->reduce_axis) { + CHECK(var->lower_bound.defined()); + CHECK(var->upper_bound.defined()); + CHECK(common::is_zero(var->lower_bound)); + CHECK(var->upper_bound.is_constant()); + int_shape.push_back((int)var->upper_bound.get_constant()); + } + // create block itervars, i0,i1... + std::vector block_vars; + std::vector iter_values; + std::vector axis_vars = common::GenDefaultAxis(tensor->shape.size()); + // bind var_values + axis_vars.insert(axis_vars.end(), tensor->reduce_axis.begin(), tensor->reduce_axis.end()); + for (int i = 0; i < var_counts; i++) { + block_vars.push_back(Var(Expr(0), Expr(int_shape[i]), cinn::UniqName("i" + std::to_string(i)), false)); + if (i >= tensor->shape.size()) { + block_vars[i]->is_reduce_axis = true; + axis_vars[i]->is_reduce_axis = true; + } + iter_values.push_back(axis_vars[i]); + // replace store's indice + VLOG(3) << "replace axis_var " << axis_vars[i]->name << " to block_var " << block_vars[i]; + optim::ReplaceVarWithExpr(&store_body, axis_vars[i], block_vars[i]); + } + store_body = ir::ScheduleBlockRealize::Make( + iter_values, ir::ScheduleBlock::Make(block_vars, {}, {}, tensor->name, store_body)); + // iter_values, ir::ScheduleBlock::Make(block_vars, {}, {}, common::UniqName(tensor->name), store_body)); + VLOG(3) << "store body\n" << store_body; + } + tuple_to_expr[tensor->name] = store_body; + } + + ir::CudaAxisInfo temp_cuda_axis_info; + Expr group_expr = + LowerGroup(group, tuple_to_expr, &global_tensor_map, resized_buffer_cache, stages_, &temp_cuda_axis_info); + + if (group_expr.defined()) { + cuda_axis_info_.emplace_back(std::move(temp_cuda_axis_info)); + if (target_ == common::DefaultNVGPUTarget() && !all_temp_tensor) { + exprs.push_back(group_expr); + Expr body = ir::Block::Make(exprs); + result.push_back(body); + exprs.clear(); + } else { + exprs.push_back(group_expr); + } + } + } + if (target_ == common::DefaultHostTarget()) { + Expr body = ir::Block::Make(exprs); + result.push_back(body); + exprs.clear(); + } else if (!exprs.empty()) { + Expr body = ir::Block::Make(exprs); + result.push_back(body); + exprs.clear(); + } + + return result; +} + +LowerImpl::LowerImpl(const std::string& fn_name, + StageMap stages, + const std::vector& tensor_args, + const 
std::vector& scalar_args, + const std::vector& temp_tensor_args, + const Target& target, + bool support_ir_schedule) + : fn_name_(fn_name), + stages_(stages), + tensor_args_(tensor_args), + scalar_args_(scalar_args), + temp_tensor_args_(temp_tensor_args), + target_(target), + support_ir_schedule_(support_ir_schedule) { + { // Initialize the graph + std::vector tensors(tensor_args.begin(), tensor_args.end()); + tensors.insert(std::end(tensors), temp_tensor_args.begin(), temp_tensor_args.end()); + + compu_graph_ = CreateCompGraph(tensors, stages, false /*inline_hide*/); + + VLOG(1) << "compute_graph:\n" << compu_graph_->Visualize(); + } + + // Todo: Here insert auto syncthreads() @haoze + + { // update schedule. + std::vector tensors(tensor_args.begin(), tensor_args.end()); + tensors.insert(std::end(tensors), temp_tensor_args_.begin(), temp_tensor_args_.end()); + compu_graph_ = CreateCompGraph(tensors, stages, true /*inline_hide*/); + + VLOG(1) << "Computation Graph:\n" << compu_graph_->Visualize(); + } +} + +} // namespace detail +} // namespace lang +} // namespace cinn diff --git a/paddle/cinn/lang/lower_impl.h b/paddle/cinn/lang/lower_impl.h new file mode 100644 index 0000000000000..923e04c90d46f --- /dev/null +++ b/paddle/cinn/lang/lower_impl.h @@ -0,0 +1,304 @@ +// Copyright (c) 2021 CINN Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "cinn/common/graph_utils.h" +#include "cinn/ir/buffer.h" +#include "cinn/ir/ir_printer.h" +#include "cinn/optim/buffer_assign.h" +#include "cinn/optim/compute_inline_expand.h" +#include "cinn/optim/fold_cinn_call_arguments.h" +#include "cinn/optim/optimize.h" +#include "cinn/optim/remove_nested_block.h" +#include "cinn/optim/replace_call_with_expr.h" +#include "cinn/optim/tensor_write_tell.h" +#include "cinn/optim/transform_gpu_forloop.h" +#include "cinn/optim/transform_polyfor_to_for.h" +#include "cinn/poly/ast_gen.h" + +namespace cinn { + +namespace poly { +class Stage; +} // namespace poly + +namespace lang { +namespace detail { + +/** + * After the AstGen build the forloop from isl exprs, all the ISL Call nodes should be mapped to the corresponding CINN + * expressions, there should be no remaining. + */ +void CheckNoIslCallRemains(const Expr* expr); + +/** + * \brief Lower a single group of nodes. + * + * We partition the whole computation of a function into several groups, each group is a basic element for ISL + * polyhedral computation, that is, we transform a group into a isl domain and schedule, and generate ast latter. + * + * @param group A single schedule group containing several Stages and the scheduling order. + * @param tuple_to_expr A map from isl set tuple name to CINN expressions. 
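+ * @param global_tensor_map A map from tensor name to the buffer-bound tensor, updated as each group is lowered.
+ * @param resized_buffer Bookkeeping for buffers that have already been resized, so a buffer is not resized twice.
+ * @param stage_map The stage map of the whole computation.
+ * @param cuda_axis_info Receives the CUDA grid/block binding info discovered in this group, if any.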
+ */ +Expr LowerGroup(const poly::ScheduleGroup& group, + const std::map& tuple_to_expr, + std::map* global_tensor_map, + std::unordered_set& resized_buffer, + StageMap stage_map, + ir::CudaAxisInfo* cuda_axis_info = nullptr); + +/** + * A Computation graph node. + */ +struct CompuGraphNode : public common::GraphNode { + explicit CompuGraphNode(ir::Tensor tensor) : tensor(tensor) {} + + ir::Tensor tensor; + + std::string id() const override; + const char* type_info() const override; + static const char* __type_info__; +}; + +/** + * \brief Create a computation graph using a tensor set. + * It will deduce the temporary tensors not in the \p tensors. + * It consider the `extra_depend_stages` stored in tensor.stage. + * + * @param tensors the input/output tensors of a computation. + * @param hide_inline hide inline tensor nodes. + * @return a graph. + */ +std::unique_ptr CreateCompGraph(const std::vector& tensors, + StageMap stages, + bool hide_inline = false); + +class LowerImpl { + public: + /** + * @param fn_name the name of the final output function. + * @param tensor_args the tensor arguments for the function + * @param scalar_args the scalar arguments for the function + * @param temp_tensor_args the extra temporary tensor arguments + * + * The \p tensor_args contains both input and output tensors. + */ + LowerImpl(const std::string& fn_name, + StageMap stages, + const std::vector& tensor_args, + const std::vector& scalar_args, + const std::vector& temp_tensor_args = {}, + const Target& target = common::DefaultHostTarget(), + bool support_ir_schedule = false); + + std::vector operator()(); + + /** + * Get the computational graph. + */ + const common::Graph* comp_graph() const { return compu_graph_.get(); } + + /** + * \brief generate the argument list of the final output function. + * We put the scalar_args in front of tensor_args, e.g. get tensor_args{A,B}, scalar_args{m}, the final argument list + * is {m, A, B}, the input and output tensor can be mixed in the tensor_args, the kInput and kOutput token will deduce + * from their usage in the computation. + */ + std::vector GenerateFunctionArgumentList(Expr fn_body); + + std::vector GenFuncArgForSplitKernel(Expr func_iterator, std::vector temp_tensors); + + /** + * \brief generate the body expression of the final output function. + */ + std::vector GenerateFunctionBody(const poly::Schedule* schedule); + + private: + /** + * \brief Collect the temporary tensors. + * A temporary tensor is one that is in the computation graph, not inlined and not in the tensor_args(similar to a + * temporary variable inside function). + */ + std::vector CollectTemporaryTensors(); + + /** + * \brief Check both the tensor_args and sclar_args not contain duplication (different arguemnt with the same name). + */ + void CheckArgsUnique(); + + /** + * \brief Get a map, for each tensor in the tensor_args, map from name to itself. + */ + inline absl::flat_hash_map GenTensorArgMap(); + + /** + * \brief Get a map, for each tensor in the computation graph, map from name to itself. + */ + inline absl::flat_hash_map GenAllTensorMap(); + + /** + * \brief Get all the tensors, including the input, output and temporary ones. + */ + std::vector CollectAllTensors(); + + /** + * \brief Collect the extra dependencies between tensors. + * + * The extra dependencies include + * 1. the control deps in Stage. 
+ * + * TODO(Superjomn) remove the field `extra_depend_stages` + */ + std::set> CollectExtraDependencies() const; + + private: + const std::string& fn_name_; + const std::vector& tensor_args_; + const std::vector& scalar_args_; + std::vector temp_tensor_args_; + Target target_; + + StageMap stages_; + + //! A computation graph generated from the tensor_args and scalar_args. + std::unique_ptr compu_graph_; + + //! CUDA axis info for this function. + std::vector cuda_axis_info_; + + bool support_ir_schedule_ = false; +}; + +/** + * \brief Tell whether a tensor contains some GPU related information, such some schedule. + */ +bool TensorContainsGPUInfo(ir::Tensor t, poly::Stage* stage); + +/** + * Mark the PolyFor as Vectorized if it is scheduled Vectorize in Stage. + */ +struct MarkVectorizeMutator : public ir::IRMutator { + const std::map& vectorizes; + + explicit MarkVectorizeMutator(const std::map& vectorizes) + : vectorizes(vectorizes) {} + + void operator()(Expr* expr) { ir::IRMutator::Visit(expr, expr); } + + // NOTE This mutator takes PolyFor as input, not For. + void Visit(const ir::PolyFor* op, Expr* expr) override { + auto* node = expr->As(); + forloop_stack.push_back(node); + ir::IRMutator::Visit(op, expr); + forloop_stack.pop_back(); + } + + // each statement in ISL is bound to a Store node. + void Visit(const ir::Store* op, Expr* expr) override { + auto* tensor_n = op->tensor.As(); + CHECK(tensor_n); + auto it = vectorizes.find(tensor_n->name); + if (it != vectorizes.end()) { + CHECK_LT(it->second.level, forloop_stack.size()); + forloop_stack[it->second.level]->set_vectorize_info(it->second); + CHECK(it->second.valid()); + } + } + + std::vector forloop_stack; +}; + +/** + * Mark the PolyFor as Unroll if is called Unroll in Stage. + */ +struct MarkUnrollMutator : public ir::IRMutator { + std::map /*level*/> unrolls; + + explicit MarkUnrollMutator(const std::map>& unrolls) : unrolls(unrolls) {} + + void operator()(Expr* expr) { ir::IRMutator<>::Visit(expr, expr); } + + void Visit(const ir::PolyFor* op, Expr* expr) override { + auto* node = expr->As(); + stack.push_back(node); + ir::IRMutator<>::Visit(op, expr); + stack.pop_back(); + } + + // each statement in ISL is bound to a Store node. + void Visit(const ir::Store* op, Expr* expr) override { + auto* tensor_n = op->tensor.As(); + CHECK(tensor_n); + auto it = unrolls.find(tensor_n->name); + if (it != unrolls.end()) { + for (int level : it->second) { + VLOG(1) << "Mark " << level << " Unrolled"; + CHECK_LT(level, stack.size()); + stack[level]->set_unrolled(); + } + } + } + + std::vector stack; +}; + +/** + * Mark the PolyFor as Parallel if is called Parallel in Stage. + */ +struct MarkParallelMutator : public ir::IRMutator { + std::map /*level*/> parallels; + + explicit MarkParallelMutator(const std::map>& parallels) : parallels(parallels) {} + + void operator()(Expr* expr) { ir::IRMutator<>::Visit(expr, expr); } + + void Visit(const ir::PolyFor* op, Expr* expr) override { + auto* node = expr->As(); + stack.push_back(node); + ir::IRMutator<>::Visit(op, expr); + stack.pop_back(); + } + + // each statement in ISL is bound to a Store node. 
+  void Visit(const ir::Store* op, Expr* expr) override {
+    auto* tensor_n = op->tensor.As<ir::_Tensor_>();
+    CHECK(tensor_n);
+    auto it = parallels.find(tensor_n->name);
+    if (it != parallels.end()) {
+      for (int level : it->second) {
+        VLOG(1) << "Mark " << level << " Parallel";
+        CHECK_LT(level, stack.size());
+        stack[level]->set_parallel();
+      }
+    }
+  }
+
+  std::vector<ir::PolyFor*> stack;
+};
+
+}  // namespace detail
+}  // namespace lang
+}  // namespace cinn
diff --git a/paddle/cinn/lang/lower_impl_test.cc b/paddle/cinn/lang/lower_impl_test.cc
new file mode 100644
index 0000000000000..32b2c234093e0
--- /dev/null
+++ b/paddle/cinn/lang/lower_impl_test.cc
@@ -0,0 +1,320 @@
+// Copyright (c) 2021 CINN Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "cinn/lang/lower_impl.h"
+
+#include <gtest/gtest.h>
+
+#include "cinn/cinn.h"
+#include "cinn/utils/string.h"
+
+namespace cinn {
+namespace lang {
+namespace detail {
+
+#define CREATE_GNODE(k__) auto* n##k__ = graph->RetrieveNode(#k__);
+#define ASSERT_LINKED(a__, b__) ASSERT_TRUE(n##a__->IsLinkedTo(n##b__));
+
+TEST(CreateCompGraph, single_layer) {
+  Expr M(100);
+  Expr N(200);
+
+  Placeholder<float> A("A", {M, N});
+  Placeholder<float> B("B", {M, N});
+
+  auto C = Compute(
+      {M, N}, [&](Expr i, Expr j) { return A(i, j) + B(i, j); }, "C");
+
+  LOG(INFO) << C->expr_fields().size();
+  for (auto* e : C->expr_fields()) {
+    LOG(INFO) << "e: " << *e;
+  }
+
+  auto stages = CreateStages({C});
+  auto graph = CreateCompGraph({A, B, C}, stages);
+
+  LOG(INFO) << "graph:\n" << graph->Visualize();
+
+  /* generated graph
+   digraph G {
+   node_0[label="A"]
+   node_1[label="B"]
+   node_2[label="C"]
+   node_0->node_2
+   node_1->node_2
+   } // end G
+  */
+
+  CREATE_GNODE(A)
+  CREATE_GNODE(B)
+  CREATE_GNODE(C)
+
+  ASSERT_TRUE(nA->IsLinkedTo(nC));
+  ASSERT_TRUE(nB->IsLinkedTo(nC));
+}
+
+TEST(CreateCompGraph, multi_layers) {
+  Expr M(100);
+  Expr N(200);
+
+  Placeholder<float> A("A", {M, N});
+  Placeholder<float> B("B", {M, N});
+
+  // A->C
+  // B->C
+  auto C = Compute(
+      {M, N}, [&](Expr i, Expr j) { return A(i, j) + B(i, j); }, "C");
+
+  // C->D
+  // B->D
+  auto D = Compute(
+      {M, N}, [&](Expr i, Expr j) { return C(i, j) + B(i, j); }, "D");
+
+  // A->E
+  // B->E
+  // C->E
+  // D->E
+  auto E = Compute(
+      {M, N}, [&](Expr i, Expr j) { return A(i, j) + B(i, j) + C(i, j) + D(i, j); }, "E");
+
+  auto stages = CreateStages({C, D, E});
+  auto graph = CreateCompGraph({A, B, E}, stages);
+
+  LOG(INFO) << "graph:\n" << graph->Visualize();
+
+  /*
+   digraph G {
+   node_0[label="A"]
+   node_1[label="B"]
+   node_3[label="C"]
+   node_4[label="D"]
+   node_2[label="E"]
+   node_0->node_2
+   node_0->node_3
+   node_1->node_2
+   node_1->node_4
+   node_1->node_3
+   node_3->node_2
+   node_3->node_4
+   node_4->node_2
+   } // end G
+   */
+
+  CREATE_GNODE(A)
+  CREATE_GNODE(B)
+  CREATE_GNODE(C)
+  CREATE_GNODE(D)
+  CREATE_GNODE(E)
+
+  ASSERT_EQ(graph->num_nodes(), 5);
+
+  ASSERT_LINKED(A, C)
+  ASSERT_LINKED(B, C)
+
+  ASSERT_LINKED(C, D)
+  ASSERT_LINKED(B, D)
+
+  ASSERT_LINKED(A, E)
+  ASSERT_LINKED(B, E)
+  ASSERT_LINKED(C, E)
ASSERT_LINKED(D, E) +} + +TEST(CreateCompGraph, multi_layers_with_extra_deps) { + Expr M(100); + Expr N(200); + + Placeholder A("A", {M, N}); + Placeholder B("B", {M, N}); + + // A->C + auto C = Compute( + {M, N}, [&](Expr i, Expr j) { return A(i, j) + 1.f; }, "C"); + + // B->D + auto D = Compute( + {M, N}, [&](Expr i, Expr j) { return B(i, j) + 1.f; }, "D"); + + // A->E + auto E = Compute( + {M, N}, [&](Expr i, Expr j) { return A(i, j) + 1.f; }, "E"); + + auto F = Compute( + {M, N}, [&](Expr i, Expr j) { return C(i, j) + D(i, j) + E(i, j); }, "F"); + + auto stages = CreateStages({C, D, E, F}); + // C->D + stages[D]->CtrlDepend(C); + // C->E + stages[E]->CtrlDepend(C); + + auto graph = CreateCompGraph({A, B, F}, stages); + + LOG(INFO) << "graph:\n" << graph->Visualize(); + + /* + digraph G { + node_0[label="A"] + node_1[label="B"] + node_3[label="C"] + node_4[label="D"] + node_5[label="E"] + node_2[label="F"] + node_0->node_5 + node_0->node_3 + node_1->node_4 + node_3->node_2 + node_3->node_5 + node_3->node_4 + node_4->node_2 + node_5->node_2 + } // end G + */ + + CREATE_GNODE(A) + CREATE_GNODE(B) + CREATE_GNODE(C) + CREATE_GNODE(D) + CREATE_GNODE(E) + CREATE_GNODE(F) + + ASSERT_LINKED(B, D) + ASSERT_LINKED(A, C) + ASSERT_LINKED(A, E) + ASSERT_LINKED(C, E) + ASSERT_LINKED(C, F) + ASSERT_LINKED(C, D) + ASSERT_LINKED(D, F) +} + +TEST(CreateCompGraph, inline_compatible) { + Expr M(100); + Expr N(200); + + Placeholder A("A", {M, N}); + Placeholder B("B", {M, N}); + + // A->C + // B->C + auto C = Compute( + {M, N}, [&](Expr i, Expr j) { return A(i, j) + B(i, j); }, "C"); + + // C->D + // B->D + auto D = Compute( + {M, N}, [&](Expr i, Expr j) { return C(i, j) + B(i, j); }, "D"); + + // A->E + // B->E + // C->E + // D->E + auto E = Compute( + {M, N}, [&](Expr i, Expr j) { return A(i, j) + B(i, j) + C(i, j) + D(i, j); }, "E"); + + auto stages = CreateStages({C, D, E}); + stages[D]->ComputeInline(); + + auto graph = CreateCompGraph({A, B, E}, stages, true); + + LOG(INFO) << "graph:\n" << graph->Visualize(); + + /* + digraph G { + node_0[label="A"] + node_1[label="B"] + node_3[label="C"] + node_2[label="E"] + node_0->node_2 + node_0->node_3 + node_1->node_2 + node_1->node_3 + node_3->node_2 + } // end G + */ + + CREATE_GNODE(A) + CREATE_GNODE(B) + CREATE_GNODE(C) + CREATE_GNODE(E) + + ASSERT_EQ(graph->num_nodes(), 4); + ASSERT_TRUE(nA->IsLinkedTo(nC)); + ASSERT_TRUE(nA->IsLinkedTo(nE)); + ASSERT_TRUE(nB->IsLinkedTo(nC)); + ASSERT_TRUE(nB->IsLinkedTo(nE)); + ASSERT_TRUE(nA->IsLinkedTo(nC)); + ASSERT_TRUE(nB->IsLinkedTo(nE)); +} + +TEST(CreateCompGraph, inline_compatible1) { + Expr M(100); + Expr N(200); + + Placeholder A("A", {M, N}); + Placeholder B("B", {M, N}); + + // A->C + // B->C + auto C = Compute( + {M, N}, [&](Expr i, Expr j) { return A(i, j) + B(i, j); }, "C"); + + // C->D + // B->D + auto D = Compute( + {M, N}, [&](Expr i, Expr j) { return C(i, j) + B(i, j); }, "D"); + + // A->E + // B->E + // C->E + // D->E + auto E = Compute( + {M, N}, [&](Expr i, Expr j) { return A(i, j) + B(i, j) + C(i, j) + D(i, j); }, "E"); + + auto stages = CreateStages({C, D, E}); + stages[C]->ComputeInline(); + + auto graph = CreateCompGraph({A, B, E}, stages, true); + + LOG(INFO) << "graph:\n" << graph->Visualize(); + + /* + digraph G { + node_0[label="A"] + node_1[label="B"] + node_3[label="D"] + node_2[label="E"] + node_0->node_2 + node_1->node_2 + node_1->node_3 + node_3->node_2 + } // end G + */ + + CREATE_GNODE(A) + CREATE_GNODE(B) + CREATE_GNODE(D) + CREATE_GNODE(E) + + ASSERT_EQ(graph->num_nodes(), 4); + + 
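+  // C was marked ComputeInline, so with hide_inline the graph keeps only A, B, D and E.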
ASSERT_TRUE(nA->IsLinkedTo(nE)); + ASSERT_TRUE(nD->IsLinkedTo(nE)); + ASSERT_TRUE(nB->IsLinkedTo(nE)); + ASSERT_TRUE(nB->IsLinkedTo(nD)); + ASSERT_TRUE(nD->IsLinkedTo(nE)); +} + +} // namespace detail +} // namespace lang +} // namespace cinn diff --git a/paddle/cinn/lang/lower_test.cc b/paddle/cinn/lang/lower_test.cc new file mode 100755 index 0000000000000..a7f9ebbebe9e7 --- /dev/null +++ b/paddle/cinn/lang/lower_test.cc @@ -0,0 +1,155 @@ +// Copyright (c) 2021 CINN Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "cinn/lang/lower.h" + +#include + +#include + +#include "cinn/cinn.h" +#include "cinn/lang/buffer.h" +#include "cinn/lang/compute.h" +#include "cinn/lang/placeholder.h" +#include "cinn/utils/string.h" + +namespace cinn { +namespace lang { + +TEST(lower, basic) { + auto M = Expr(100); + auto N = Expr(15); + + Placeholder A("A", {Expr(M), Expr(N)}); + + auto B = Compute( + {M, N}, [=](Var i, Var j) -> Expr { return A(i, j) + 1.f; }, "B"); + + auto stages = CreateStages({B}); + + auto lower_funcs = Lower("cal_B", stages, {A, B}); + + LOG(INFO) << "lower_size " << lower_funcs; + +#define TEST_SOUTPUT(x, out) \ + std::cout << "\n" << x << std::endl; \ + EXPECT_EQ(utils::GetStreamCnt(x), utils::Trim(out)); + + auto out = R"ROC( +{ + serial for (i, 0, 100) + { + serial for (j, 0, 15) + { + B[i, j] = (1.00000000f + A[i, j]) + } + } +} +)ROC"; + TEST_SOUTPUT(lower_funcs->body, out); +} + +TEST(lower, more_complex) { + Expr M(100); + Expr N(15); + Expr K(200); + + Placeholder A("A", {Expr(M), Expr(N)}); + Placeholder B("B", {Expr(N), Expr(K)}); + + auto C = Compute( + {M, N, K}, [=](Var i, Var j, Var k) -> Expr { return A(i, j) * B(j, k); }, "C"); + + auto stages = CreateStages({C}); + + auto lower_funcs = Lower("cal_C", stages, {A, B, C}); + + std::cout << "func:\n" << Expr(lower_funcs->self()) << std::endl; +} + +//! To support training, the dynamic shape support is vital. We test the corresponding lower ability here. +TEST(lower, dynamic_shape) { + Var B("B"); // B is like shape here. + Expr N(15); + Expr K(200); + + // Input is B * N, B is like batch. + Placeholder X("X", {Expr(B), Expr(N)}); + Placeholder W("W", {Expr(N), Expr(K)}); + + auto C = Compute( + {B, N, K}, [=](Var i, Var j, Var k) -> Expr { return X(i, j) * W(j, k); }, "C"); + + auto stages = CreateStages({C}); + auto lower_funcs = Lower("cal_C", stages, {X, W, C}); + + std::cout << "func:\n" << Expr(lower_funcs->self()) << std::endl; +} + +TEST(lower, lowered_call) { + Var B("B"); // B is like shape here. + Expr N(15); + + // Input is B * N, B is like batch. 
+ Placeholder X("X", {Expr(B), Expr(N)}); + Placeholder Y("Y", {Expr(B), Expr(N)}); + + auto Z = Compute( + {B, N}, [&](Var i, Var j) { return X(i, j) + Y(i, j); }, "Z"); + + std::vector return_types({{Float(32), std::vector{{B, N}}, "C"}}); + auto tensors = CallLowered("lowered_fun0", {X, Y, Z}, return_types); + auto C = tensors[0]; + + auto stages = CreateStages({X, Y, Z, C}); + + LOG(INFO) << "call_op: " << C->operation->as()->call_expr; + + auto lower_func = Lower("fn", stages, {X, Y, Z, C}); +} + +// test the temp_buffers are all collected. +TEST(lower, temp_buffer_collects) { + Expr M(10); + + Placeholder A("A", {M}); + + auto B = Compute( + {M}, [&](Expr i) -> Expr { return A(i); }, "B"); // temp + auto C = Compute( + {M}, [&](Expr i) -> Expr { return B(i); }, "C"); // temp + auto D = Compute( + {M}, [&](Expr i) -> Expr { return C(i); }, "D"); // temp + auto output = Compute( + {M}, [&](Expr i) -> Expr { return D(i); }, "output"); + + ir::Module::Builder b("somemodule", common::DefaultHostTarget()); + + auto stages = CreateStages({B, C, D, output}); + + auto fn = Lower("fn", stages, {A, output}, {}, {}, &b); + + auto module = b.Build(); + + ASSERT_EQ(module.buffers().size(), 3UL); + + std::set detected_buffer_names({"_B", "_C", "_D"}); + + for (auto& buffer : module.buffers()) { + ASSERT_TRUE(detected_buffer_names.count(buffer->name)); + } +} + +} // namespace lang +} // namespace cinn diff --git a/paddle/cinn/lang/packed_func.cc b/paddle/cinn/lang/packed_func.cc new file mode 100644 index 0000000000000..47f6e777c2c2d --- /dev/null +++ b/paddle/cinn/lang/packed_func.cc @@ -0,0 +1,27 @@ +// Copyright (c) 2021 CINN Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "cinn/lang/packed_func.h" + +namespace cinn { +namespace lang { + +Args::Args(cinn_value_t *values, int *type_codes, int len) { + for (int i = 0; i < len; i++) { + values_.emplace_back(values[i], type_codes[i]); + } +} + +} // namespace lang +} // namespace cinn diff --git a/paddle/cinn/lang/packed_func.h b/paddle/cinn/lang/packed_func.h new file mode 100644 index 0000000000000..eca3fe84cd9f6 --- /dev/null +++ b/paddle/cinn/lang/packed_func.h @@ -0,0 +1,128 @@ +// Copyright (c) 2021 CINN Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
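+
+// A minimal sketch of the calling convention declared below, mirroring the
+// tests in packed_func_test.cc (the lambda body is illustrative only):
+//
+//   PackedFunc::body_t body = [](Args args, RetValue* ret) {
+//     int a = args[0];  // CINNValue converts on demand
+//     int b = args[1];
+//     *ret = (a + b);
+//   };
+//   PackedFunc add(body);
+//   int c = add(1, 2);  // the variadic operator() packs 1 and 2 into Args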
+
+#pragma once
+
+#include <functional>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "cinn/common/cinn_value.h"
+#include "cinn/ir/ir.h"
+
+namespace cinn {
+namespace lang {
+using common::CINNValue;
+
+/**
+ * A single argument value to Function.
+ */
+using ArgValue = CINNValue;
+
+using RetValue = CINNValue;
+
+/**
+ * Arguments of the PackedFunc.
+ */
+class Args {
+ public:
+  Args() = default;
+  Args(cinn_value_t* values, int* type_codes, int len);
+
+  //! Append an argument \p arg to the pack.
+  void Append(const ArgValue& arg) { values_.push_back(arg); }
+
+  //! Count of the arguments.
+  size_t size() const { return values_.size(); }
+
+  //! Tell whether the argument pack is empty.
+  bool empty() const { return values_.empty(); }
+
+  //! Get i-th element.
+  ArgValue& operator[](int i) { return values_[i]; }
+  const ArgValue& operator[](int i) const { return values_[i]; }
+
+  common::CINNValuePack ToValuePack() const { return common::CINNValuePack(values_); }
+
+ private:
+  std::vector<ArgValue> values_;
+};
+
+namespace detail {
+
+template <bool stop, std::size_t I, typename F>
+struct for_each_dispatcher {
+  template <typename T, typename... Args>
+  static void Run(const F& f, T&& value, Args&&... args) {
+    f(I, std::forward<T>(value));
+    for_each_dispatcher<sizeof...(Args) == 0, (I + 1), F>::Run(f, std::forward<Args>(args)...);
+  }
+};
+
+template <std::size_t I, typename F>
+struct for_each_dispatcher<true, I, F> {
+  static void Run(const F& f) {}
+};
+
+template <typename F, typename... Args>
+inline void for_each(const F& f, Args&&... args) {
+  for_each_dispatcher<sizeof...(Args) == 0, 0, F>::Run(f, std::forward<Args>(args)...);
+}
+
+struct FuncArgsSetter {
+  FuncArgsSetter(Args* args) : args_(args) {}  // NOLINT
+
+  template <typename T>
+  void operator()(size_t I, T v) const {
+    args_->Append(ArgValue(v));
+  }
+
+ private:
+  mutable Args* args_{};
+};
+
+}  // namespace detail
+
+/**
+ * A function definition with its arguments packed; all the PackedFuncs share the same signature.
+ */
+class PackedFunc {
+ public:
+  using body_t = std::function<void(Args args, RetValue* ret)>;
+
+  PackedFunc() = default;
+  explicit PackedFunc(const std::string& name) : name_(name) {}
+  explicit PackedFunc(body_t body) : body_(body) {}
+
+  template <typename... Args_>
+  inline RetValue operator()(Args_&&... args) const {
+    Args _args;
+    detail::FuncArgsSetter setter(&_args);
+    detail::for_each(setter, std::forward<Args_>(args)...);
+
+    RetValue ret_value;
+    body_(_args, &ret_value);
+    return ret_value;
+  }
+
+  inline body_t body() const { return body_; }
+
+ private:
+  std::string name_;
+  body_t body_;
+};
+
+}  // namespace lang
+}  // namespace cinn
diff --git a/paddle/cinn/lang/packed_func_test.cc b/paddle/cinn/lang/packed_func_test.cc
new file mode 100644
index 0000000000000..e374c4655e3c7
--- /dev/null
+++ b/paddle/cinn/lang/packed_func_test.cc
@@ -0,0 +1,95 @@
+// Copyright (c) 2021 CINN Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
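+
+// The detail::for_each helper in packed_func.h calls a functor once per
+// argument together with its positional index; this is how operator() appends
+// each call-site argument to the Args pack in order. A standalone sketch (the
+// printing functor is illustrative only):
+//
+//   detail::for_each(
+//       [](std::size_t i, int v) { std::printf("arg %zu = %d\n", i, v); },
+//       10, 20);  // prints: arg 0 = 10, arg 1 = 20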
+ +#include "cinn/lang/packed_func.h" + +#include + +#include "cinn/ir/ir_operators.h" +#include "cinn/ir/ir_printer.h" +#include "cinn/utils/string.h" + +namespace cinn { +namespace lang { + +TEST(Function, test) { + PackedFunc::body_t func_body = [](Args args, RetValue* ret) { + int a = args[0]; + int b = args[1]; + *ret = (a + b); + }; + PackedFunc func(func_body); + + int c = func(1, 2); + LOG(INFO) << "c " << c; +} + +TEST(Function, test1) { + PackedFunc::body_t body = [](Args args, RetValue* ret) { + auto* msg = static_cast(args[0]); + (*ret) = msg; + }; + + PackedFunc func(body); + const char* msg = "hello world"; + char* c = func(msg); + LOG(INFO) << static_cast(c); +} + +TEST(Function, Expr) { + PackedFunc::body_t body = [](Args args, RetValue* ret) { + Expr a = args[0]; + Expr b = args[1]; + + ASSERT_EQ(a->__ref_count__.val(), 4); + ASSERT_EQ(b->__ref_count__.val(), 4); + + Expr c = a + b; + (*ret) = CINNValue(c); + }; + + PackedFunc func(body); + + Expr a(1); + Expr b(2); + ASSERT_EQ(a->__ref_count__.val(), 1); + ASSERT_EQ(b->__ref_count__.val(), 1); + + Expr ret = func(a, b); + + ASSERT_EQ(utils::GetStreamCnt(ret), "(1 + 2)"); +} + +TEST(Function, ReturnMultiValue) { + PackedFunc::body_t body = [](Args args, RetValue* ret) { + int a = args[0]; + int b = args[1]; + int c = a + b; + int d = a - b; + + *ret = common::CINNValuePack{{common::CINNValue(c), common::CINNValue(d)}}; + }; + + PackedFunc func(body); + + common::CINNValuePack ret = func(1, 2); + int c = ret[0]; + int d = ret[1]; + + EXPECT_EQ(c, 3); + EXPECT_EQ(d, -1); +} + +} // namespace lang +} // namespace cinn diff --git a/paddle/cinn/lang/placeholder.cc b/paddle/cinn/lang/placeholder.cc new file mode 100644 index 0000000000000..c73476c2db021 --- /dev/null +++ b/paddle/cinn/lang/placeholder.cc @@ -0,0 +1,65 @@ +// Copyright (c) 2021 CINN Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
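+
+// CreatePlaceHolder (defined below) is the runtime-typed twin of the templated
+// Placeholder<T>: it switches on the cinn Type and instantiates the matching
+// template. A sketch of the equivalence (shape and name are illustrative):
+//
+//   ir::Tensor a = CreatePlaceHolder({32, 32}, Float(32), "X");
+//   Placeholder<float> b("X", std::vector<int>{32, 32});  // equivalent placeholder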
+
+#include "cinn/lang/placeholder.h"
+
+#include "cinn/runtime/intrinsic.h"
+
+namespace cinn {
+namespace lang {
+
+using cinn::common::bfloat16;
+using cinn::common::float16;
+
+ir::Tensor CreatePlaceHolder(const std::vector<int> &shape, Type type, const std::string &name) {
+  std::vector<Expr> expr_shape;
+  for (int s : shape) {
+    expr_shape.push_back(Expr(s));
+  }
+  return CreatePlaceHolder(expr_shape, type, name);
+}
+
+ir::Tensor CreatePlaceHolder(const std::vector<Expr> &shape, Type type, const std::string &name) {
+  if (type.is_float(32)) {
+    return Placeholder<float>(name, shape);
+  } else if (type.is_float(64)) {
+    return Placeholder<double>(name, shape);
+  } else if (type.is_bfloat16()) {
+    return Placeholder<bfloat16>(name, shape);
+  } else if (type.is_float16()) {
+    return Placeholder<float16>(name, shape);
+  } else if (type.is_int(8)) {
+    return Placeholder<int8_t>(name, shape);
+  } else if (type.is_int(16)) {
+    return Placeholder<int16_t>(name, shape);
+  } else if (type.is_int(32)) {
+    return Placeholder<int32_t>(name, shape);
+  } else if (type.is_int(64)) {
+    return Placeholder<int64_t>(name, shape);
+  } else if (type.is_uint(8)) {
+    return Placeholder<uint8_t>(name, shape);
+  } else if (type.is_uint(16)) {
+    return Placeholder<uint16_t>(name, shape);
+  } else if (type.is_uint(32)) {
+    return Placeholder<uint32_t>(name, shape);
+  } else if (type.is_uint(64)) {
+    return Placeholder<uint64_t>(name, shape);
+  } else if (type.is_bool()) {
+    return Placeholder<bool>(name, shape);
+  }
+  CINN_NOT_IMPLEMENTED
+}
+
+}  // namespace lang
+}  // namespace cinn
diff --git a/paddle/cinn/lang/placeholder.h b/paddle/cinn/lang/placeholder.h
new file mode 100644
index 0000000000000..dc945559cea23
--- /dev/null
+++ b/paddle/cinn/lang/placeholder.h
@@ -0,0 +1,115 @@
+// Copyright (c) 2021 CINN Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include <string>
+#include <vector>
+
+#include "cinn/common/common.h"
+#include "cinn/ir/buffer.h"
+#include "cinn/ir/ir.h"
+#include "cinn/ir/ir_printer.h"
+#include "cinn/ir/operation.h"
+#include "cinn/ir/tensor.h"
+#include "cinn/runtime/intrinsic.h"
+
+namespace cinn {
+namespace lang {
+
+using ir::Expr;
+
+/**
+ * Placeholder
+ * @tparam T the data type of the placeholder elements.
+ */
+template <typename T>
+class Placeholder {
+ public:
+  Placeholder(const std::string &name, const std::vector<int> &shape);
+  Placeholder(const std::string &name, const std::vector<Expr> &shape);
+
+  //! Get a slice.
+ // @{ + Expr operator()(Expr a) const { return Call({a}); } + Expr operator()(Expr a, Expr b) const { return Call({a, b}); } + Expr operator()(Expr a, Expr b, Expr c) const { return Call({a, b, c}); } + Expr operator()(Expr a, Expr b, Expr c, Expr d) const { return Call({a, b, c, d}); } + Expr operator()(const std::vector &indices) const; + // @} + + Type type() const { return tensor_->type(); } + + operator ir::Tensor() { return tensor_; } + operator ir::Expr() { return Expr(tensor_); } + + ir::Tensor &operator->() { return tensor_; } + const ir::Tensor &operator->() const { return tensor_; } + + ir::Tensor tensor() const { return tensor_; } + + private: + Expr Call(const std::vector &indices) const; + + void Init(const std::string &name, const std::vector &shape); + + ir::Tensor tensor_; +}; + +template +Expr Placeholder::operator()(const std::vector &indices) const { + return tensor_(indices); +} + +template +Expr Placeholder::Call(const std::vector &indices) const { + return tensor_(indices); +} + +template +Placeholder::Placeholder(const std::string &name, const std::vector &shape) { + std::vector _shape; + for (int v : shape) _shape.push_back(Expr(v)); + Init(name, _shape); +} + +template +Placeholder::Placeholder(const std::string &name, const std::vector &shape) { + Init(name, shape); +} + +ir::Tensor CreatePlaceHolder(const std::vector &shape, Type type, const std::string &name); + +ir::Tensor CreatePlaceHolder(const std::vector &shape, Type type, const std::string &name); + +/// ------- details ------- +template +void Placeholder::Init(const std::string &name, const std::vector &shape) { + ir::Var buffer_ptr(Context::Global().NewName("buffer")); + buffer_ptr->set_type(type_of()); + + std::vector strides(shape.size(), Expr(1)); + Expr offset(0); + + std::vector axis; + for (int i = 0; i < shape.size(); i++) axis.emplace_back(common::axis_name(i)); + + auto op = ir::PlaceholderOp::Make(name, shape, type_of()); + + tensor_ = ir::Tensor(name, type_of(), shape, shape, op, {}); + Buffer buffer(tensor_->type()); + tensor_->Bind(buffer); +} + +} // namespace lang +} // namespace cinn diff --git a/paddle/cinn/lang/placeholder_test.cc b/paddle/cinn/lang/placeholder_test.cc new file mode 100644 index 0000000000000..5043b5280dfd6 --- /dev/null +++ b/paddle/cinn/lang/placeholder_test.cc @@ -0,0 +1,48 @@ +// Copyright (c) 2021 CINN Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "cinn/lang/placeholder.h" + +#include + +#include "cinn/ir/ir_printer.h" + +namespace cinn { +namespace lang { + +TEST(placeholder, basic) { + Expr M(100); + Expr N(20); + + Placeholder x("x", {M, N}); + + ir::Var i("i"); + ir::Var j("j"); + + auto slice = x(i, j); + LOG(INFO) << "slice " << slice; +} + +TEST(placeholder, dynamic_shape) { + Var B("B", Int(32)); + Expr N(20); + + Placeholder x("x", {B, N}); + + Var i("i"), j("j"); + auto slice = x(i, j); +} + +} // namespace lang +} // namespace cinn diff --git a/paddle/cinn/optim/CMakeLists.txt b/paddle/cinn/optim/CMakeLists.txt new file mode 100755 index 0000000000000..54407db0af697 --- /dev/null +++ b/paddle/cinn/optim/CMakeLists.txt @@ -0,0 +1,50 @@ +core_gather_headers() + +gather_srcs(cinnapi_src SRCS + remove_nested_block.cc + replace_call_with_expr.cc + ir_copy.cc + ir_replace.cc + replace_var_with_expr.cc + tensor_write_tell.cc + ir_simplify.cc + optimize.cc + vectorize_loops.cc + unroll_loops.cc + transform_polyfor_to_for.cc + eliminate_broadcast_in_forloop.cc + fold_cinn_call_arguments.cc + call_arg_list_to_pod_value.cc + insert_debug_log_callee.cc + lower_function_call_bind_vars.cc + extern_call_process.cc + map_extern_call.cc + compute_inline_expand.cc + buffer_assign.cc + replace_const_param_to_integer.cc + cast_simplify.cc + if_simplify.cc + lower_intrin.cc + cast_bool_to_int8.cc + collect_undefined_vars.cc + var_mod_simplify.cc + remove_schedule_block.cc + ) + +if (WITH_CUDA) + gather_srcs(cinnapi_src SRCS transform_gpu_forloop.cc) +endif() + + +cc_test(test_remove_nested_block SRCS remove_nested_block_test.cc DEPS cinncore) +cc_test(test_ir_copy SRCS ir_copy_test.cc DEPS cinncore) +cc_test(test_ir_simplify SRCS ir_simplify_test.cc DEPS cinncore) +cc_test(test_replace_call_with_expr SRCS replace_call_with_expr_test.cc DEPS cinncore) +cc_test(test_vectorize_loops SRCS vectorize_loops_test.cc DEPS cinncore ARGS ${global_test_args}) +cc_test(test_transform_polyfor_to_for SRCS transform_polyfor_to_for_test.cc DEPS cinncore ARGS ${global_test_args}) +cc_test(test_optimize SRCS optimize_test.cc DEPS cinncore) +cc_test(test_cache_read_write_replace SRCS cache_read_write_replace_test.cc DEPS cinncore) +cc_test(test_cast_simplify SRCS cast_simplify_test.cc DEPS cinncore) +cc_test(test_if_simplify SRCS if_simplify_test.cc DEPS cinncore) +cc_test(test_remove_schedule_block SRCS remove_schedule_block_test.cc DEPS cinncore) +cc_test(test_unroll_loops SRCS unroll_loops_test.cc DEPS cinncore) diff --git a/paddle/cinn/optim/buffer_assign.cc b/paddle/cinn/optim/buffer_assign.cc new file mode 100644 index 0000000000000..0b59feb339237 --- /dev/null +++ b/paddle/cinn/optim/buffer_assign.cc @@ -0,0 +1,156 @@ +// Copyright (c) 2021 CINN Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
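+
+// InitialAssignBuffer (below) clusters tensors that must alias one buffer with
+// a union-find, then walks the computation graph in topological order so the
+// first tensor of each cluster owns the allocation and the rest share it.
+// A toy sketch of the clustering step (tensor names are illustrative):
+//
+//   common::UnionFind union_find;
+//   auto* a = union_find.AddNode(new BufferUFNode("A"));
+//   auto* c = union_find.AddNode(new BufferUFNode("C"));
+//   a->Union(c);  // A and C now form one cluster -> one shared buffer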
+
+#include "cinn/optim/buffer_assign.h"
+
+#include "cinn/common/union_find.h"
+#include "cinn/ir/ir_mutator.h"
+#include "cinn/ir/ir_printer.h"
+#include "cinn/lang/lower_impl.h"
+#include "cinn/optim/ir_replace.h"
+
+namespace cinn {
+namespace optim {
+
+namespace {
+
+struct BufferUFNode : public common::UnionFindNode {
+  BufferUFNode(const std::string& x) : tensor_name(x) {}
+
+  const char* type_info() const override { return __type_info__; }
+
+  std::string tensor_name;
+  static const char* __type_info__;
+};
+
+const char* BufferUFNode::__type_info__ = "BufferUFNode";
+
+struct IRReplaceTensorMutator : ir::IRMutator<> {
+  const std::map<std::string, ir::Tensor>& tensor_map;
+  IRReplaceTensorMutator(const std::map<std::string, ir::Tensor>& tensor_map) : tensor_map(tensor_map) {}
+  void operator()(Expr* expr) { ir::IRMutator<>::Visit(expr, expr); }
+
+  void Visit(const ir::_Tensor_* op, Expr* expr) override {
+    auto it = tensor_map.find(op->name);
+    if (it != tensor_map.end()) {
+      *expr = Expr(it->second);
+    }
+  }
+};
+
+}  // namespace
+
+std::map<std::string, ir::Tensor> InitialAssignBuffer(Expr* expr,
+                                                      poly::StageMap stages,
+                                                      const std::map<std::string, ir::Tensor>& all_tensor_map,
+                                                      const common::Graph* comp_graph,
+                                                      const std::set<std::string>& temp_tensor_names) {
+  // The tensor map helps to reserve only one tensor instance for a tensor (identified by the same name).
+  std::map<std::string, ir::Tensor> buffer_updated_tensor;
+
+  for (auto& item : all_tensor_map) {
+    if (stages[item.second]->inlined()) continue;
+    buffer_updated_tensor[item.second->name] = item.second;
+  }
+
+  // union-find to cluster the tensors with the same buffer.
+  common::UnionFind union_find;
+
+  // unify all the tensor occurrences with a global one, e.g. if multiple occurrences of tensor B exist in the
+  // expression, replace them with a shared one.
+  ir::CollectIRNodes(*expr, [&](const Expr* x) -> bool {
+    auto* t = x->as_tensor();
+    if (t && !stages[t]->inlined()) {
+      Reference(x) = Expr(all_tensor_map.at(t->name));
+    }
+    return false;
+  });
+
+  std::map<std::string, BufferUFNode*> uf_map;
+  for (auto& item : all_tensor_map) {
+    auto* n = union_find.AddNode(new BufferUFNode(item.second->name));
+    uf_map[item.second->name] = n->safe_as<BufferUFNode>();
+  }
+
+  for (auto& item : buffer_updated_tensor) {
+    auto* cur_n = uf_map[item.first];
+    for (auto& other : stages[item.second]->meta.tensors_to_share_buffer_with) {
+      // we might initialize the buffer in args.
+      auto* other_n = uf_map[other];
+      if (!other_n) continue;
+
+      VLOG(3) << "share buffer between " << item.first << " " << other_n->tensor_name;
+      cur_n->Union(other_n);
+    }
+  }
+
+  // Determine which tensor gets the initial buffer that will be shared across the cluster: take a topological order
+  // of the computational graph and find the tensor that comes first in each cluster.
+
+  auto _topo_order_topo_edges_ = comp_graph->topological_order();
+  auto& topo_order = std::get<0>(_topo_order_topo_edges_);
+  auto& topo_edges = std::get<1>(_topo_order_topo_edges_);
+  for (common::GraphNode* n : topo_order) {
+    auto nn = n->safe_as<lang::detail::CompuGraphNode>();
+    CHECK(nn);
+    {
+      auto it = uf_map.find(nn->tensor->name);
+      CHECK(it != uf_map.end());
+      auto& cluster_info = std::get<0>(it->second->GetRoot())->cluster_info;
+      if (cluster_info.empty()) {  // buffer owner (a tensor) of this cluster is not set yet.
+        cluster_info = nn->tensor->name;
+      }
+    }
+  }
+
+  // Get the center of a cluster, considering the following rule:
+  // 1. Prefer a tensor arg over a temp tensor.
+  auto cluster_get_center_tensor = [&](const std::vector<common::UnionFindNode*>& cluster) {
+    ir::Tensor some_tensor;
+    // try to find a node that is a tensor_arg, allocate buffer for it, and make others share buffer with it.
+ for (auto* n : cluster) { + auto* node = n->safe_as(); + bool is_temp = temp_tensor_names.count(node->tensor_name); + if (!is_temp) return all_tensor_map.at(node->tensor_name); + if (all_tensor_map.at(node->tensor_name)->buffer.defined()) { + return all_tensor_map.at(node->tensor_name); + } + some_tensor = all_tensor_map.at(node->tensor_name); + } + return some_tensor; + }; + + for (auto& cluster : union_find.GetClusters()) { + auto root_tensor = cluster_get_center_tensor(cluster); + if (!root_tensor->buffer.defined() && !root_tensor->type().is_void()) { + root_tensor->WithBuffer(); + } + + for (auto* n : cluster) { + auto& tensor = all_tensor_map.at(n->safe_as()->tensor_name); + if (tensor != root_tensor) { + auto keep_shape = root_tensor->buffer->shape; + Reference(&tensor)->Bind(root_tensor->buffer); + root_tensor->buffer->shape = keep_shape; + Reference(&tensor)->buffer->shape = keep_shape; + VLOG(3) << "keep_shape is : " << utils::GetStreamCnt(keep_shape[0]); + } + } + } + + return buffer_updated_tensor; +} + +} // namespace optim +} // namespace cinn diff --git a/paddle/cinn/optim/buffer_assign.h b/paddle/cinn/optim/buffer_assign.h new file mode 100644 index 0000000000000..69464607ad7de --- /dev/null +++ b/paddle/cinn/optim/buffer_assign.h @@ -0,0 +1,39 @@ +// Copyright (c) 2021 CINN Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include + +#include "cinn/ir/ir.h" +#include "cinn/poly/stage.h" + +namespace cinn { +namespace optim { + +/** + * Assign buffer for tensors those are not marked as compute_inline. + * @param expr + * @param stages The stage map. + */ +std::map InitialAssignBuffer(Expr* expr, + poly::StageMap stages, + const std::map& all_tensor_map, + const common::Graph* comp_graph, + const std::set& temp_tensor_names); + +} // namespace optim +} // namespace cinn diff --git a/paddle/cinn/optim/cache_read_write_replace_test.cc b/paddle/cinn/optim/cache_read_write_replace_test.cc new file mode 100755 index 0000000000000..eda11ac0ccc3d --- /dev/null +++ b/paddle/cinn/optim/cache_read_write_replace_test.cc @@ -0,0 +1,125 @@ +// Copyright (c) 2021 CINN Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
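+
+// The two tests below pin the exact lowered IR of CacheRead/CacheWrite. Their
+// schematic effect, with names taken from the expected output strings:
+//
+//   A_read_cache[i, j]  = A[i, j];                       // CacheRead("shared", {C})
+//   C_write_cache[i, j] = A_read_cache[i, j] + B[i, j];  // compute into the write cache
+//   C[i, j]             = C_write_cache[i, j];           // CacheWrite("local")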
+ +#include + +#include "cinn/cinn.h" +#include "cinn/ir/ir.h" + +namespace cinn { +namespace optim { + +TEST(CacheReadWriteReplace, basic) { + Context::Global().ResetNameId(); + Expr M(100); + Expr N(20); + + Placeholder A("A", {M, N}); + Placeholder B("B", {M, N}); + + auto C = Compute( + {M, N}, [&](Expr i, Expr j) -> Expr { return A(i, j) + B(i, j); }, "C"); + + auto stages = CreateStages({C}); + + // AA cache + std::vector readers{C}; + auto AA = stages[A]->CacheRead("shared", readers, stages); + auto CC = stages[C]->CacheWrite("local", stages, C); + + auto fn = Lower("fn", stages, {A, B, C}, {}, {AA, CC}); + + LOG(INFO) << "fn:\n" << Expr(fn); + + auto target = R"ROC( +function fn (_A, _B, _C) +{ + serial for (i, 0, 100) + { + serial for (j, 0, 20) + { + A_read_cache[i, j] = A[i, j] + } + } + serial for (i, 0, 100) + { + serial for (j, 0, 20) + { + C_write_cache[i, j] = (A_read_cache[i, j] + B[i, j]) + } + } + serial for (i, 0, 100) + { + serial for (j, 0, 20) + { + C[i, j] = C_write_cache[i, j] + } + } +} + )ROC"; + + ASSERT_EQ(utils::Trim(target), utils::GetStreamCnt(fn)); +} + +TEST(CacheReadWriteReplace, cache_write) { + Context::Global().ResetNameId(); + + Expr M(100); + Expr N(100); + + Placeholder A("A", {M, N}); + Placeholder B("B", {M, N}); + + auto C = Compute( + {M, N}, [=](Expr i, Expr j) { return A(i, j) + 1.f; }, "C"); + + auto C0 = Compute( + {M, N}, [=](Expr i, Expr j) { return C(i, j) + 1.f; }, "C0"); + auto C1 = Compute( + {M, N}, [=](Expr i, Expr j) { return C0(i, j) + 1.f; }, "C1"); + + auto stages = CreateStages({A, B, C, C0, C1}); + stages[C]->ComputeInline(); + stages[C0]->ComputeInline(); + + auto Co = stages[C1]->CacheWrite("shared", stages, C1); + + auto fn = Lower("fn", stages, {A, B, Co}, {}, {C, C0, C1}); + LOG(INFO) << "\n" << fn; + + auto target_source = R"ROC( +function fn (_A, _B, _C1_write_cache) +{ + serial for (i, 0, 100) + { + serial for (j, 0, 100) + { + C1_write_cache[i, j] = (3.00000000f + A[i, j]) + } + } + serial for (i, 0, 100) + { + serial for (j, 0, 100) + { + C1[i, j] = C1_write_cache[i, j] + } + } +} +)ROC"; + + ASSERT_EQ(utils::Trim(target_source), utils::GetStreamCnt(fn)); +} + +} // namespace optim +} // namespace cinn diff --git a/paddle/cinn/optim/call_arg_list_to_pod_value.cc b/paddle/cinn/optim/call_arg_list_to_pod_value.cc new file mode 100644 index 0000000000000..afdddbb566a1b --- /dev/null +++ b/paddle/cinn/optim/call_arg_list_to_pod_value.cc @@ -0,0 +1,108 @@ +// Copyright (c) 2021 CINN Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
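+
+// CallArgListToPodValue (below) rewrites a CINN call so each scalar or buffer
+// argument travels through one cinn_pod_value_t array. Schematically, in
+// pseudo-IR with hypothetical arguments:
+//
+//   before:  fn(_A_buffer, 3.14f)
+//   after:   let _pod_val_0; buffer_p_to_cinn_pod_value(_A_buffer, &_pod_val_0)
+//            let _pod_val_1; float_to_cinn_pod_value(3.14f, &_pod_val_1)
+//            let _pod_arr = [_pod_val_0, _pod_val_1]
+//            fn(_pod_arr, 2)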
+ +#include "cinn/optim/call_arg_list_to_pod_value.h" + +#include +#include +#include + +#include "cinn/common/ir_util.h" +#include "cinn/ir/ir_mutator.h" +#include "cinn/runtime/intrinsic.h" + +namespace cinn { +namespace optim { + +namespace { + +struct CallArgListToPodValueMutator : ir::IRMutator<> { + void operator()(Expr* e) { ir::IRMutator<>::Visit(e, e); } + + private: + void Visit(const ir::Call* op, Expr* expr) override { + if (op->is_cinn_call()) { + auto _oprs_args_ = pack_arg_exprs(op); // NOLINT + auto& oprs = std::get<0>(_oprs_args_); + auto& args = std::get<1>(_oprs_args_); + + Var pod_array_var(Context::Global().NewName("_pod_arr"), + type_of().with_lanes(op->total_args_count())); + + // Declare pod_array. + oprs.push_back(ir::Let::Make(pod_array_var, Expr())); + oprs.push_back(ir::intrinsics::ArgsConstruct::Make(pod_array_var, args)); + + auto new_call = ir::Call::Make(Void(), + op->name, + {pod_array_var, common::make_const(Int(32), args.size())}, + {}, + ir::CallType::CINN, + op->func, + op->value_index); + + oprs.push_back(new_call); + + *expr = ir::Block::Make(oprs); + } + } + + std::tuple /*oprs*/, std::vector /*args*/> pack_arg_exprs(const ir::Call* op) { + std::vector exprs; + std::vector args; + + auto pack_arg = [&](const Expr& arg) { + Var pod_var(Context::Global().NewName("_pod_val_"), type_of()); + + // declare the array. + exprs.push_back(ir::Let::Make(pod_var, Expr())); + + auto pod_val_addr_expr = ir::intrinsics::GetAddr::Make(pod_var); + + Expr cast; + if (arg.As()) { + cast = runtime::IntrinsicCall( + Void(), runtime::intrinsic::buffer_p_to_cinn_pod_value_repr, {arg}, {pod_val_addr_expr}); + + } else if (arg.type() == type_of()) { + cast = runtime::IntrinsicCall( + Void(), runtime::intrinsic::float_to_cinn_pod_value_repr, {arg}, {pod_val_addr_expr}); + } else if (arg.type() == type_of()) { + cast = runtime::IntrinsicCall( + Void(), runtime::intrinsic::int32_to_cinn_pod_value_repr, {arg}, {pod_val_addr_expr}); + } else { + CINN_NOT_IMPLEMENTED + } + + exprs.push_back(cast); + args.push_back(pod_val_addr_expr); + }; + + for (auto& arg : op->read_args) { + pack_arg(arg); + } + for (auto& arg : op->write_args) { + pack_arg(arg); + } + + return std::make_tuple(exprs, args); + } +}; + +} // namespace + +void CallArgListToPodValue(Expr* e) { CallArgListToPodValueMutator()(e); } + +} // namespace optim +} // namespace cinn diff --git a/paddle/cinn/optim/call_arg_list_to_pod_value.h b/paddle/cinn/optim/call_arg_list_to_pod_value.h new file mode 100644 index 0000000000000..2c568177ff75f --- /dev/null +++ b/paddle/cinn/optim/call_arg_list_to_pod_value.h @@ -0,0 +1,28 @@ +// Copyright (c) 2021 CINN Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +/** + * \file Transform the CINN Call node's args to cinn_pod_value_t array. 
+ */ + +#include "cinn/ir/ir.h" + +namespace cinn { +namespace optim { + +void CallArgListToPodValue(Expr* e); + +} // namespace optim +} // namespace cinn diff --git a/paddle/cinn/optim/cast_bool_to_int8.cc b/paddle/cinn/optim/cast_bool_to_int8.cc new file mode 100644 index 0000000000000..86584aba5072c --- /dev/null +++ b/paddle/cinn/optim/cast_bool_to_int8.cc @@ -0,0 +1,47 @@ +// Copyright (c) 2021 CINN Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "cinn/optim/cast_bool_to_int8.h" + +#include + +#include "cinn/ir/ir_mutator.h" + +namespace cinn::optim { + +namespace { + +struct Mutator : public ir::IRMutator<> { + using ir::IRMutator<>::Visit; + + void Visit(const ir::Store* op, Expr* expr) override { + auto* node = expr->As(); + CHECK(node); + auto value = node->value; + if (op->type().is_bool() && op->value->type().is_bool()) { + value = ir::Cast::Make(Int(8), value); + *expr = ir::Store::Make(node->tensor, value, node->indices); + } + } +}; + +} // namespace + +void CastBoolToInt8(Expr* e, Target target) { + if (target.arch == Target::Arch::X86) { + Mutator mutator; + mutator.Visit(e, e); + } +} +} // namespace cinn::optim diff --git a/paddle/cinn/optim/cast_bool_to_int8.h b/paddle/cinn/optim/cast_bool_to_int8.h new file mode 100644 index 0000000000000..c7770840167e5 --- /dev/null +++ b/paddle/cinn/optim/cast_bool_to_int8.h @@ -0,0 +1,34 @@ +// Copyright (c) 2021 CINN Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include "cinn/ir/ir.h" + +namespace cinn::optim { + +/** + * Cast the expr from bool to Int8 type for llvm codegen, currently used in cpu. + * + * e.g. + * + * The expression: + * a = b + * + * to + * + * a = int8(b) + */ +void CastBoolToInt8(Expr* e, Target target); + +} // namespace cinn::optim diff --git a/paddle/cinn/optim/cast_simplify.cc b/paddle/cinn/optim/cast_simplify.cc new file mode 100644 index 0000000000000..eb88dbc3d29a4 --- /dev/null +++ b/paddle/cinn/optim/cast_simplify.cc @@ -0,0 +1,117 @@ +// Copyright (c) 2021 CINN Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "cinn/optim/cast_simplify.h" + +#include "cinn/ir/ir_mutator.h" + +namespace cinn::optim { + +using cinn::common::bfloat16; +using cinn::common::float16; + +namespace { + +template +CastType NormCastValue(T value) { + if (type_of().is_uint() || type_of().is_uint()) { + // not support uint + return static_cast(value); + } + + if (std::isinf(value)) { + return std::numeric_limits::infinity(); + } else if (std::isnan(value)) { + return std::numeric_limits::signaling_NaN(); + } else if (value >= static_cast(std::numeric_limits::max())) { + return std::numeric_limits::max(); + } else if (value <= static_cast(std::numeric_limits::lowest())) { + return std::numeric_limits::lowest(); + } + return static_cast(value); +} + +struct Mutator : ir::IRMutator<> { + using ir::IRMutator<>::Visit; + + void Visit(const ir::Cast* op, Expr* expr) { + auto* node = expr->As(); + + Visit(&node->v(), &node->v()); + + if (op->type() == op->v().type()) { + *expr = op->v(); + return; + } + +#define __CAST_TO_TYPE(type__) \ + if (auto* i = op->v().As()) { \ + *expr = Expr(static_cast(i->value)); \ + } else if (auto* f = op->v().As()) { \ + *expr = Expr(static_cast(NormCastValue(f->value))); \ + } else if (auto* u = op->v().As()) { \ + *expr = Expr(static_cast(u->value)); \ + } else { \ + CINN_NOT_IMPLEMENTED \ + } + + if (op->v().is_constant()) { + if (op->type() == type_of()) { + __CAST_TO_TYPE(int8_t) + } else if (op->type() == type_of()) { + __CAST_TO_TYPE(int16_t) + } else if (op->type() == type_of()) { + __CAST_TO_TYPE(int32_t) + } else if (op->type() == type_of()) { + __CAST_TO_TYPE(int64_t) + } else if (op->type() == type_of()) { + __CAST_TO_TYPE(uint8_t) + } else if (op->type() == type_of()) { + __CAST_TO_TYPE(uint16_t) + } else if (op->type() == type_of()) { + __CAST_TO_TYPE(uint32_t) + } else if (op->type() == type_of()) { + __CAST_TO_TYPE(uint64_t) + } else if (op->type() == type_of()) { + __CAST_TO_TYPE(float) + } else if (op->type() == type_of()) { + __CAST_TO_TYPE(double) + } else if (op->type() == type_of()) { + __CAST_TO_TYPE(bool) + } else if (op->type() == type_of()) { + __CAST_TO_TYPE(uint32_t) + } else if (op->type() == type_of()) { + __CAST_TO_TYPE(uint64_t) + } else if (op->type() == type_of()) { + // Cannot simplify!!! pass + __CAST_TO_TYPE(bfloat16) + } else if (op->type() == type_of()) { + // Cannot simplify!!! pass + __CAST_TO_TYPE(float16) + } else { + CINN_NOT_IMPLEMENTED + } + } +#undef __CAST_TO_TYPE + } +}; + +} // namespace + +void CastSimplify(Expr* e) { + Mutator mutator; + mutator.Visit(e, e); +} + +} // namespace cinn::optim diff --git a/paddle/cinn/optim/cast_simplify.h b/paddle/cinn/optim/cast_simplify.h new file mode 100644 index 0000000000000..7a3e1abf1ff70 --- /dev/null +++ b/paddle/cinn/optim/cast_simplify.h @@ -0,0 +1,30 @@ +// Copyright (c) 2021 CINN Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "cinn/ir/ir.h" + +namespace cinn::optim { + +/** + * Simplify the Cast nodes. + * + * There are several patterns: + * 1. the source and target type are the same, drop the Cast node + * 2. for intermediate numbers, just replace the Cast node with a Node of the target type + */ +void CastSimplify(Expr* e); + +} // namespace cinn::optim diff --git a/paddle/cinn/optim/cast_simplify_test.cc b/paddle/cinn/optim/cast_simplify_test.cc new file mode 100644 index 0000000000000..2aad9b6789556 --- /dev/null +++ b/paddle/cinn/optim/cast_simplify_test.cc @@ -0,0 +1,63 @@ +// Copyright (c) 2021 CINN Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "cinn/optim/cast_simplify.h" + +#include + +#include "cinn/ir/ir_operators.h" +#include "cinn/ir/ir_printer.h" + +namespace cinn::optim { + +TEST(CastSimplify, same_type) { + Var n("n"); + Expr a = ir::Cast::Make(Int(32), n); + LOG(INFO) << n->type(); + LOG(INFO) << a; + CastSimplify(&a); + ASSERT_EQ(utils::GetStreamCnt(a), "n"); +} + +TEST(CastSimplify, Imm_int) { + Expr a = ir::Cast::Make(Int(64), Expr(1)); + Expr c = ir::Cast::Make(Int(32), a); + LOG(INFO) << c; + CastSimplify(&c); + LOG(INFO) << c; + ASSERT_EQ(utils::GetStreamCnt(c), "1"); + ASSERT_EQ(c.type(), Int(32)); +} + +TEST(CastSimplify, Imm_double) { + Expr a = ir::Cast::Make(Float(64), Expr(2.33)); + Expr c = ir::Cast::Make(Int(32), a); + LOG(INFO) << c; + CastSimplify(&c); + LOG(INFO) << c; + ASSERT_EQ(utils::GetStreamCnt(c), "2"); + ASSERT_EQ(c.type(), Int(32)); +} + +TEST(CastSimplify, Imm_uint) { + Expr a = ir::Cast::Make(UInt(64), Expr(1)); + Expr c = ir::Cast::Make(UInt(32), a); + LOG(INFO) << c; + CastSimplify(&c); + LOG(INFO) << c; + ASSERT_EQ(utils::GetStreamCnt(c), "1"); + ASSERT_EQ(c.type(), UInt(32)); +} + +} // namespace cinn::optim diff --git a/paddle/cinn/optim/collect_undefined_vars.cc b/paddle/cinn/optim/collect_undefined_vars.cc new file mode 100644 index 0000000000000..244342bad2cb4 --- /dev/null +++ b/paddle/cinn/optim/collect_undefined_vars.cc @@ -0,0 +1,109 @@ +// Copyright (c) 2021 CINN Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#include "cinn/optim/collect_undefined_vars.h" + +#include + +#include "cinn/ir/ir_mutator.h" + +namespace cinn::optim { + +namespace { +struct Mutator : public ir::IRMutator<> { + using ir::IRMutator<>::Visit; + std::vector undefined_vars; + std::set defined_vars; + std::set used_vars; + + void CollectVarDef(const std::string& var) { + CHECK(!defined_vars.count(var)) << "var " << var << " has been defined, please check"; + CHECK(!used_vars.count(var)) << "var " << var << " is wrongly used before definition"; + defined_vars.insert(var); + } + + void ClearVar(const std::string& var) { + defined_vars.erase(var); + used_vars.erase(var); + } + + void CollectVarUse(const std::string& var) { + used_vars.insert(var); + if (defined_vars.count(var) == 0) { + undefined_vars.push_back(var); + } + } + + void Visit(const ir::Let* op, Expr* expr) final { + Expr symbol = op->symbol; + auto var = symbol.as_var_ref(); + CHECK(var.defined()); + CollectVarDef(var->name); + auto* node = expr->As(); + Visit(&node->body, &node->body); + } + + void Visit(const ir::For* op, Expr* expr) final { + CollectVarDef(op->loop_var->name); + auto* node = expr->As(); + Visit(&node->min, &node->min); + Visit(&node->extent, &node->extent); + Visit(&node->body, &node->body); + ClearVar(op->loop_var->name); + } + + void Visit(const ir::Load* op, Expr* expr) final { + auto tensor = op->tensor.as_tensor_ref(); + CollectVarUse(tensor->name); + auto* node = expr->As(); + for (auto& idx : node->indices) Visit(&idx, &idx); + } + + void Visit(const ir::Store* op, Expr* expr) final { + auto tensor = op->tensor.as_tensor_ref(); + CollectVarUse(tensor->name); + auto* node = expr->As(); + for (auto& idx : node->indices) Visit(&idx, &idx); + Visit(&node->value, &node->value); + } + + void Visit(const ir::_Var_* op, Expr* expr) final { + CollectVarUse(op->name); + auto* node = expr->As(); + if (node->lower_bound.defined()) { + Visit(&node->lower_bound, &node->lower_bound); + } + if (node->upper_bound.defined()) { + Visit(&node->upper_bound, &node->upper_bound); + } + } + + void Visit(const ir::Reduce* op, Expr* expr) final { + for (auto& axis : op->reduce_axis) { + CollectVarDef(axis->name); + } + auto* node = expr->As(); + if (node->init.defined()) Visit(&node->init, &node->init); + Visit(&node->body, &node->body); + } +}; +} // namespace + +std::vector CollectUndefinedVars(Expr* e) { + Mutator mutator; + mutator.Visit(e, e); + return mutator.undefined_vars; +} + +} // namespace cinn::optim diff --git a/paddle/cinn/optim/collect_undefined_vars.h b/paddle/cinn/optim/collect_undefined_vars.h new file mode 100644 index 0000000000000..25b4de3a2d4d5 --- /dev/null +++ b/paddle/cinn/optim/collect_undefined_vars.h @@ -0,0 +1,36 @@ +// Copyright (c) 2021 CINN Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
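+
+// A usage sketch for the declaration below (the expression is hypothetical):
+// in `for i { a[i] = b[i] }` the loop defines i, so only the tensors a and b
+// are reported:
+//
+//   std::vector<std::string> undef = optim::CollectUndefinedVars(&expr);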
+ +#pragma once +#include +#include + +#include "cinn/ir/ir.h" +namespace cinn::optim { + +/** + * Collect undefined vars in the scope. + * + * e.g. + * + * The expression: + * for i + * for j + * a[i, j] = b[i, j] + * + * here a, b are vars without definition + */ +std::vector CollectUndefinedVars(Expr* e); + +} // namespace cinn::optim diff --git a/paddle/cinn/optim/compute_inline_expand.cc b/paddle/cinn/optim/compute_inline_expand.cc new file mode 100644 index 0000000000000..9e110706aab57 --- /dev/null +++ b/paddle/cinn/optim/compute_inline_expand.cc @@ -0,0 +1,233 @@ +// Copyright (c) 2021 CINN Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "cinn/optim/compute_inline_expand.h" + +#include +#include + +#include "cinn/common/graph_utils.h" +#include "cinn/ir/ir_mutator.h" +#include "cinn/optim/ir_copy.h" +#include "cinn/optim/replace_var_with_expr.h" + +namespace cinn { +namespace optim { + +namespace { + +/* + * Replace a tensor(marked as compute_inline) to the expanded expression. + */ +struct TensorInlineExpandMutator : public ir::IRMutator<> { + const std::string &tensor_name_; + std::map *all_tensor_map_; + poly::StageMap stages_; + bool inline_code{false}; + bool temp_buffer{false}; + bool memory_local{false}; + std::unordered_map> resized_buffer_cache; + std::vector tensor_names; + std::vector> replace_var; + std::map var_to_extent; + + TensorInlineExpandMutator(const std::string &tensor_name, + std::map *all_tensor_map, + poly::StageMap stages) + : tensor_name_(tensor_name), all_tensor_map_(all_tensor_map), stages_(stages) {} + + void operator()(Expr *expr) { + ir::IRMutator<>::Visit(expr, expr); + for (int i = 0; i < tensor_names.size(); i++) { + for (auto &var : replace_var[i]) { + } + } + } + + void Visit(const ir::_Var_ *expr, Expr *op) override { + if (inline_code && temp_buffer) { + if (utils::Startswith(expr->name, "blockIdx") || (utils::Startswith(expr->name, "threadIdx") && memory_local)) { + *op = ir::Expr(0); + } + } + } + + void Visit(const ir::_Tensor_ *op, Expr *expr) override { + if (inline_code && utils::Endswith(op->name, "_write_cache") && + (*all_tensor_map_).at(op->name)->buffer->memory_type == ir::MemoryType::Heap) { + auto no_cache_name = op->name.substr(0, op->name.size() - 12); + VLOG(2) << "no_cache_name: " << no_cache_name; + CHECK(all_tensor_map_->count(no_cache_name)); + *expr = (*all_tensor_map_)[no_cache_name]; + } + } + + void Visit(const ir::For *op, Expr *expr) override { + CHECK(op->extent.is_constant()); + int cons_extent = (int)op->extent.get_constant(); + var_to_extent[op->loop_var->name] = op->extent; + ir::IRMutator<>::Visit(op, expr); + } + + void Visit(const ir::PolyFor *op, Expr *expr) override { + auto extent = op->ExtractExtent(); + var_to_extent[op->iterator->name] = extent; + ir::IRMutator<>::Visit(op, expr); + } + + void Visit(const ir::Load *op, Expr *expr) override { + auto *node = expr->As(); + auto *tensor = node->tensor.as_tensor(); + if (tensor && tensor->name == tensor_name_) 
+      *expr = tensor->inline_expanded(op->indices);
+      inline_code = true;
+      ir::IRMutator<>::Visit(expr, expr);
+      inline_code = false;
+    } else if (inline_code && tensor->buffer.defined()) {
+      bool is_heap = (*all_tensor_map_).at(tensor->name)->buffer->memory_type == ir::MemoryType::Heap;
+      if (utils::Endswith(tensor->buffer->name, "_write_cache") && is_heap) {
+        // temp fix: cache_write will wrongly replace the tensor with its cache tensor
+        auto no_cache_name = tensor->buffer->name.substr(1, tensor->buffer->name.size() - 13);
+        if (all_tensor_map_->count(no_cache_name)) {
+          ir::IRMutator<>::Visit(&node->tensor, &node->tensor);
+        } else {
+          auto *tensor = node->tensor.as_tensor();
+          CHECK(tensor);
+          // fix the computeAt case
+          auto shapes = tensor->shape;
+          CHECK_EQ(shapes.size(), node->indices.size());
+          for (int i = 0; i < shapes.size(); i++) {
+            if (common::is_zero(shapes[i] - 1)) {
+              node->indices[i] = Expr(0);
+            }
+          }
+        }
+      } else if (utils::Endswith(tensor->buffer->name, "_write_cache") ||
+                 utils::Endswith(tensor->buffer->name, "_read_cache") ||
+                 utils::Endswith(tensor->buffer->name, "_temp_buffer")) {
+#ifdef CINN_WITH_CUDA
+        auto axis_names = stages_[tensor]->axis_names();
+        auto compute_ats = stages_[tensor]->GetComputeAts();
+        if (compute_ats.size() == 1) {
+          int level_tmp;
+          for (auto &i : compute_ats) {
+            level_tmp = i.second.level;
+          }
+          std::vector<Var> replace_vars;
+          for (int j = 0; j <= level_tmp; j++) {
+            if (var_to_extent.count(axis_names[j]) == 0) continue;
+            replace_vars.push_back(Var(var_to_extent[axis_names[j]], axis_names[j]));
+          }
+          replace_var.push_back(replace_vars);
+          tensor_names.push_back(tensor->buffer->name);
+        }
+#endif
+        bool keep_buffer = temp_buffer;
+        temp_buffer = true;
+        bool keep_memory_local = memory_local;
+        if ((*all_tensor_map_).at(tensor->name)->buffer->memory_type == ir::MemoryType::GPULocal) {
+          memory_local = true;
+        }
+        ir::IRMutator<>::Visit(&node->tensor, &node->tensor);
+        for (int i = 0; i < node->indices.size(); i++) {
+          auto temp = optim::IRCopy(node->indices[i]);
+          ir::IRMutator<>::Visit(&temp, &temp);
+          node->indices[i] = temp;
+        }
+        temp_buffer = keep_buffer;
+        memory_local = keep_memory_local;
+      } else {
+        ir::IRMutator<>::Visit(&node->tensor, &node->tensor);
+        for (int i = 0; i < node->indices.size(); i++) {
+          auto temp = optim::IRCopy(node->indices[i]);
+          ir::IRMutator<>::Visit(&temp, &temp);
+          node->indices[i] = temp;
+        }
+      }
+    } else {
+      ir::IRMutator<>::Visit(&node->tensor, &node->tensor);
+      for (int i = 0; i < node->indices.size(); i++) {
+        auto temp = optim::IRCopy(node->indices[i]);
+        ir::IRMutator<>::Visit(&temp, &temp);
+        node->indices[i] = temp;
+      }
+    }
+  }
+};
+
+struct SSANode : public common::GraphNode {
+  std::string id_;
+
+  explicit SSANode(const std::string &id) : id_(id) {}
+
+  std::string id() const override { return id_; }
+
+  const char *type_info() const override { return __type_info__; }
+
+  static constexpr char *__type_info__ = "optim::SSANode";
+};
+
+// TODO(Superjomn) The graph here is not a real SSA yet; it is flattened, since ir::CollectIRNodes collects all the
+// tensors recursively and cannot preserve the nesting level information. Fix it.
+struct SSABuilder : public ir::IRMutator<> {
+  common::Graph graph;
+
+  SSABuilder &operator()(Expr *expr) {
+    ir::IRMutator<>::Visit(expr, expr);
+    return *this;
+  }
+
+  void Visit(const ir::Store *op, Expr *expr) override {
+    auto *node = expr->As<ir::Store>();
+
+    auto *cur_graph_node = graph.RetrieveNode(node->tensor.as_tensor()->name);
+    if (!cur_graph_node) {
+      cur_graph_node = graph.RegisterNode(node->tensor.as_tensor()->name, new SSANode(node->tensor.as_tensor()->name));
+    }
+
+    auto deps_tensor_names = node->tensor.as_tensor()->GetDependTensorNames();
+    for (auto &t : deps_tensor_names) {
+      auto *n = graph.RetrieveNode(t);
+      if (!n) {
+        n = graph.RegisterNode(t, new SSANode(t));
+      }
+      n->Controls(cur_graph_node);
+    }
+  }
+};
+
+} // namespace
+
+void ComputeInlineExpand(Expr *expr, poly::StageMap stages, std::map<std::string, ir::Tensor> *all_tensor_map) {
+  // the inline tensors contained in the expression.
+  auto inline_tensors =
+      ir::CollectIRNodes(*expr, [&](const Expr *x) { return x->as_tensor() && stages[x->as_tensor()]->inlined(); });
+
+  // Keep expanding while any inline tensor remains.
+  // NOTE This is a naive method that greedily expands the inline tensors until none is left; a better way is to
+  // build a SSA graph and expand the inline tensors in reverse dependency order.
+  // TODO(Superjomn) Use the SSA graph to improve this.
+  while (!inline_tensors.empty()) {
+    for (const auto &t : inline_tensors) {
+      auto *tensor = t.as_tensor();
+      TensorInlineExpandMutator(tensor->name, all_tensor_map, stages)(expr);
+    }
+
+    inline_tensors = ir::CollectLoadTensors(
+        *expr, [&](const Expr *x) { return x->as_tensor() && stages[x->as_tensor()]->inlined(); });
+  }
+}
+
+} // namespace optim
+} // namespace cinn
diff --git a/paddle/cinn/optim/compute_inline_expand.h b/paddle/cinn/optim/compute_inline_expand.h
new file mode 100644
index 0000000000000..9fa5baf682eb8
--- /dev/null
+++ b/paddle/cinn/optim/compute_inline_expand.h
@@ -0,0 +1,33 @@
+// Copyright (c) 2021 CINN Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <map>
+
+#include "cinn/cinn.h"
+
+namespace cinn {
+namespace optim {
+
+/**
+ * Recursively expand the tensors marked as compute_inline.
+ * @param expr the expression to modify.
+ * @param stages the stage map, used to check whether a tensor is marked inline.
+ * @param all_tensor_map the map from tensor name to tensor for all tensors in the expression.
+ */
+void ComputeInlineExpand(Expr* expr, poly::StageMap stages, std::map<std::string, ir::Tensor>* all_tensor_map);
+
+} // namespace optim
+} // namespace cinn
diff --git a/paddle/cinn/optim/eliminate_broadcast_in_forloop.cc b/paddle/cinn/optim/eliminate_broadcast_in_forloop.cc
new file mode 100644
index 0000000000000..64ee0ba7a5664
--- /dev/null
+++ b/paddle/cinn/optim/eliminate_broadcast_in_forloop.cc
@@ -0,0 +1,111 @@
+// Copyright (c) 2021 CINN Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "cinn/optim/eliminate_broadcast_in_forloop.h"
+
+#include <tuple>
+#include <vector>
+
+#include "cinn/ir/ir_mutator.h"
+#include "cinn/ir/ir_printer.h"
+#include "cinn/ir/ir_visitor.h"
+#include "cinn/optim/ir_replace.h"
+
+namespace cinn {
+namespace optim {
+
+namespace detail {
+
+struct EliminateBroadcastInForloop : public ir::IRMutator<Expr*> {
+  void operator()(Expr* expr) { ir::IRMutator<>::Visit(expr, expr); }
+
+  void Visit(const ir::Store* op, Expr* expr) {
+    // TODO(Superjom) Support a single level of forloop as well.
+    if (forloop_stack.size() < 2) return;
+
+    auto* node = expr->As<ir::Store>();
+
+    auto broadcasts = ir::CollectIRNodes(node->value, [&](const Expr* expr) { return expr->As<ir::Broadcast>(); });
+    std::vector<Expr> let_exprs;
+
+    Var tmp;
+    Expr let_expr;
+
+    Var cur_level_loop_var = forloop_stack.back()->As<ir::For>() ? forloop_stack.back()->As<ir::For>()->loop_var
+                                                                 : forloop_stack.back()->As<ir::PolyFor>()->iterator;
+    for (Expr broadcast : broadcasts) {
+      if (ContainsLoopVar(broadcast, cur_level_loop_var)) continue;
+      VLOG(4) << "eliminating " << broadcast;
+      std::tie(let_expr, tmp) = CreateTmpLet(broadcast);
+      let_exprs.push_back(let_expr);
+
+      optim::IrReplace(expr, broadcast, tmp);
+    }
+
+    // insert the let expressions into the outer forloop.
+
+    Expr* outer_forloop = forloop_stack[forloop_stack.size() - 2];
+
+    auto& outer_forloop_body =
+        outer_forloop->As<ir::For>() ? outer_forloop->As<ir::For>()->body : outer_forloop->As<ir::PolyFor>()->body;
+
+    auto* outer_forloop_body_block = outer_forloop_body.As<ir::Block>();
+    if (outer_forloop_body_block) {
+      outer_forloop_body_block->stmts.insert(
+          std::begin(outer_forloop_body_block->stmts), let_exprs.begin(), let_exprs.end());
+
+    } else {
+      let_exprs.push_back(outer_forloop_body);
+      outer_forloop_body = ir::Block::Make(let_exprs);
+    }
+  }
+
+  bool ContainsLoopVar(Expr expr, Var loop_var) {
+    return !ir::CollectIRNodes(expr, [&](const Expr* e) -> bool {
+              return e->As<ir::_Var_>() && e->As<ir::_Var_>()->name == loop_var->name;
+            }).empty();
+  }
+
+  std::tuple<Expr, Var> CreateTmpLet(Expr body) {
+    Var tmp(Context::Global().NewName("tmp"), body.type());
+
+    Expr let_expr = ir::Let::Make(tmp, body);
+
+    return std::make_tuple(let_expr, tmp);
+  }
+
+  void Visit(const ir::For* op, Expr* expr) {
+    forloop_stack.push_back(expr);
+    ir::IRMutator<>::Visit(op, expr);
+    forloop_stack.pop_back();
+  }
+
+  void Visit(const ir::PolyFor* op, Expr* expr) {
+    forloop_stack.push_back(expr);
+    ir::IRMutator<>::Visit(op, expr);
+    forloop_stack.pop_back();
+  }
+
+  std::vector<Expr*> forloop_stack;
+};
+
+} // namespace detail
+
+void EliminateBroadcastInForloop(Expr* expr) {
+  detail::EliminateBroadcastInForloop mutator;
+  mutator(expr);
+}
+
+} // namespace optim
+} // namespace cinn
diff --git a/paddle/cinn/optim/eliminate_broadcast_in_forloop.h b/paddle/cinn/optim/eliminate_broadcast_in_forloop.h
new file mode 100644
index 0000000000000..95f1a9a4063a6
--- /dev/null
+++ b/paddle/cinn/optim/eliminate_broadcast_in_forloop.h
@@ -0,0 +1,24 @@
+// Copyright (c) 2021 CINN Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include "cinn/ir/ir.h"
+
+namespace cinn {
+namespace optim {
+
+void EliminateBroadcastInForloop(Expr* expr);
+
+} // namespace optim
+} // namespace cinn
diff --git a/paddle/cinn/optim/extern_call_process.cc b/paddle/cinn/optim/extern_call_process.cc
new file mode 100644
index 0000000000000..0f3f62c243b68
--- /dev/null
+++ b/paddle/cinn/optim/extern_call_process.cc
@@ -0,0 +1,41 @@
+// Copyright (c) 2021 CINN Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "cinn/optim/extern_call_process.h"
+
+#include "cinn/ir/ir_mutator.h"
+
+namespace cinn {
+namespace optim {
+
+namespace {
+
+struct ExternCallMultiOutputShallowStoreMutator : public ir::IRMutator<> {
+  void operator()(Expr* e) { ir::IRMutator<>::Visit(e, e); }
+
+ private:
+  void Visit(const ir::Store* op, Expr* expr) override {
+    auto* call = op->value.As<ir::Call>();
+    if (call && call->is_extern_call() && !call->write_args.empty()) {
+      // A multi-output extern call writes its results through write_args, so
+      // the enclosing Store is redundant; keep only the call itself.
+      *expr = op->value;
+    }
+  }
+};
+
+} // namespace
+
+void ExternCallMultiOutputShallowStore(Expr* e) { ExternCallMultiOutputShallowStoreMutator()(e); }
+
+} // namespace optim
+} // namespace cinn
diff --git a/paddle/cinn/optim/extern_call_process.h b/paddle/cinn/optim/extern_call_process.h
new file mode 100644
index 0000000000000..6f371a1134d7f
--- /dev/null
+++ b/paddle/cinn/optim/extern_call_process.h
@@ -0,0 +1,27 @@
+// Copyright (c) 2021 CINN Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "cinn/ir/ir.h"
+
+namespace cinn {
+namespace optim {
+
+void ExternCallMultiOutputShallowStore(Expr* e);
+
+void ExternCallRemoveTupleGetStatements(Expr* e);
+
+} // namespace optim
+} // namespace cinn
diff --git a/paddle/cinn/optim/fold_cinn_call_arguments.cc b/paddle/cinn/optim/fold_cinn_call_arguments.cc
new file mode 100644
index 0000000000000..e09e7ede205fb
--- /dev/null
+++ b/paddle/cinn/optim/fold_cinn_call_arguments.cc
@@ -0,0 +1,114 @@
+// Copyright (c) 2021 CINN Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "cinn/optim/fold_cinn_call_arguments.h"
+
+#include <unordered_set>
+#include <vector>
+
+#include "cinn/ir/ir_mutator.h"
+#include "cinn/ir/ir_printer.h"
+#include "cinn/utils/string.h"
+
+namespace cinn {
+namespace optim {
+
+namespace {
+
+/**
+ * Fold the arguments of the Call nodes marked as CINN (a call to a LoweredFunc).
+ */
+struct FoldCINNCallArgumentsMutator : public ir::IRMutator<> {
+  void operator()(Expr* expr) { ir::IRMutator<>::Visit(expr, expr); }
+
+ private:
+  void Visit(const ir::Block* op, Expr* expr) override {
+    auto* node = expr->As<ir::Block>();
+    for (auto it = node->stmts.begin(); it != node->stmts.end();) {
+      if (it->As<ir::Store>()) {
+        auto* call = it->As<ir::Store>()->value.As<ir::Call>();
+        if (call && call->is_cinn_call()) {
+          // remove the duplicate calls.
+          std::string key = utils::GetStreamCnt(Expr(call));
+          if (visited_call_.count(key)) {
+            it = node->stmts.erase(it);
+            continue;
+          }
+
+          ir::IRMutator<>::Visit(&(*it), &(*it));
+          visited_call_.insert(key);
+          continue;
+        }
+      }
+
+      ir::IRMutator<>::Visit(&(*it), &(*it));
+      ++it;
+    }
+  }
+  void Visit(const ir::Store* op, Expr* expr) override {
+    auto* node = expr->As<ir::Store>();
+    if (node->value.As<ir::Call>()) {
+      auto* call = node->value.As<ir::Call>();
+      switch (call->call_type) {
+        case ir::CallType::CINN:
+          MutateCall(call);
+          *expr = node->value;
+          break;
+        case ir::CallType::Intrinsic:
+          break;
+        case ir::CallType::Extern:
+          break;
+        default:
+          CINN_NOT_IMPLEMENTED
+      }
+    }
+  }
+
+  void MutateCall(ir::Call* call) {
+    if (call->call_type == ir::CallType::Extern) return;
+
+    std::vector<Expr> read_args;
+    std::vector<Expr> write_args;
+    for (auto& arg : call->read_args) {
+      if (arg.as_tensor()) {
+        CHECK(arg.as_tensor()->buffer.defined()) << "arg tensor [" << arg.as_tensor()->name << "] has no buffer";
+        read_args.push_back(arg.as_tensor()->buffer);
+      } else {
+        read_args.push_back(arg);
+      }
+    }
+
+    for (auto& arg : call->write_args) {
+      if (arg.as_tensor()) {
+        write_args.push_back(arg.as_tensor()->buffer);
+      } else {
+        write_args.push_back(arg);
+      }
+    }
+
+    call->read_args = read_args;
+    call->write_args = write_args;
+  }
+
+ private:
+  // Avoid triggering the same call twice.
+  std::unordered_set<std::string> visited_call_;
+};
+
+} // namespace
+
+void FoldCINNCallArguments(Expr* expr) { FoldCINNCallArgumentsMutator()(expr); }
+
+} // namespace optim
+} // namespace cinn
diff --git a/paddle/cinn/optim/fold_cinn_call_arguments.h b/paddle/cinn/optim/fold_cinn_call_arguments.h
new file mode 100644
index 0000000000000..8c15438792077
--- /dev/null
+++ b/paddle/cinn/optim/fold_cinn_call_arguments.h
@@ -0,0 +1,46 @@
+// Copyright (c) 2021 CINN Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <string>
+
+#include "cinn/ir/ir.h"
+
+namespace cinn {
+namespace optim {
+
+/**
+ * \brief Rewrite the Call nodes whose type is marked as CINN, packing their arguments into `void*, int` so that they
+ * can trigger a `LoweredFunc`.
+ *
+ * For example, given the input IR
+ * \code
+ * Call(some_lowered_func, a:cinn_buffer_t*, b:cinn_buffer_t*, c:cinn_buffer_t*)
+ * \endcode
+ *
+ * this pass will rewrite it to
+ * \code
+ * cinn_pod_value_t a_(a);
+ * cinn_pod_value_t b_(b);
+ * cinn_pod_value_t c_(c);
+ *
+ * cinn_args_construct(packed_args, a_, b_, c_);
+ * Call(some_lowered_func, packed_args, 3); // 3 is the number of arguments
+ * \endcode
+ */
+void FoldCINNCallArguments(Expr* expr);
+
+} // namespace optim
+} // namespace cinn
diff --git a/paddle/cinn/optim/if_simplify.cc b/paddle/cinn/optim/if_simplify.cc
new file mode 100644
index 0000000000000..0d999dac84795
--- /dev/null
+++ b/paddle/cinn/optim/if_simplify.cc
@@ -0,0 +1,57 @@
+// Copyright (c) 2021 CINN Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "cinn/optim/if_simplify.h"
+
+#include "cinn/ir/ir_mutator.h"
+
+namespace cinn::optim {
+
+namespace {
+
+struct Mutator : public ir::IRMutator<> {
+  using ir::IRMutator<>::Visit;
+
+  void Visit(const ir::IfThenElse* op, Expr* expr) {
+    auto* condition_int = op->condition.As<ir::IntImm>();
+    auto* condition_uint = op->condition.As<ir::UIntImm>();
+    int64_t value;
+    if (condition_int || condition_uint) {
+      if (condition_int) {
+        value = condition_int->value;
+      } else {
+        value = condition_uint->value;
+      }
+      if (value) {
+        *expr = op->true_case;
+      } else {
+        if (op->false_case.defined()) {
+          *expr = op->false_case;
+        } else {
+          // no false_case, so replace the statement with an empty block
+          *expr = ir::Block::Make({});
+        }
+      }
+    }
+  }
+};
+
+} // namespace
+
+void IfSimplify(Expr* e) {
+  Mutator mutator;
+  mutator.Visit(e, e);
+}
+
+} // namespace cinn::optim
diff --git a/paddle/cinn/optim/if_simplify.h b/paddle/cinn/optim/if_simplify.h
new file mode 100644
index 0000000000000..2e4fa1426ee59
--- /dev/null
+++ b/paddle/cinn/optim/if_simplify.h
@@ -0,0 +1,22 @@
+// Copyright (c) 2021 CINN Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include "cinn/ir/ir.h"
+
+namespace cinn::optim {
+
+void IfSimplify(Expr* e);
+
+} // namespace cinn::optim
diff --git a/paddle/cinn/optim/if_simplify_test.cc b/paddle/cinn/optim/if_simplify_test.cc
new file mode 100644
index 0000000000000..1221a58b805cc
--- /dev/null
+++ b/paddle/cinn/optim/if_simplify_test.cc
@@ -0,0 +1,70 @@
+// Copyright (c) 2021 CINN Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "cinn/optim/if_simplify.h"
+
+#include <gtest/gtest.h>
+
+#include <string>
+
+#include "cinn/ir/ir_printer.h"
+
+namespace cinn::optim {
+
+TEST(IfSimplify, if_true) {
+  Var n("n");
+  auto e = ir::IfThenElse::Make(Expr(1) /*true*/, ir::Let::Make(n, Expr(1)), ir::Let::Make(n, Expr(2)));
+
+  LOG(INFO) << "\n" << e;
+
+  IfSimplify(&e);
+
+  LOG(INFO) << e;
+
+  ASSERT_EQ(utils::GetStreamCnt(e), "int32 n = 1");
+}
+
+TEST(IfSimplify, if_false) {
+  Var n("n");
+  auto e = ir::IfThenElse::Make(Expr(0) /*false*/, ir::Let::Make(n, Expr(1)), ir::Let::Make(n, Expr(2)));
+
+  LOG(INFO) << "\n" << e;
+
+  IfSimplify(&e);
+
+  LOG(INFO) << e;
+
+  ASSERT_EQ(utils::GetStreamCnt(e), "int32 n = 2");
+}
+
+TEST(IfSimplify, if_else_empty) {
+  Var n("n");
+  auto e = ir::IfThenElse::Make(Expr(0) /*false*/, ir::Let::Make(n, Expr(1)));
+
+  LOG(INFO) << "\n" << e;
+
+  IfSimplify(&e);
+
+  LOG(INFO) << e;
+
+  std::string target = utils::Trim(R"ROC(
+{
+
+}
+)ROC");
+
+  ASSERT_EQ(utils::GetStreamCnt(e), target);
+}
+
+} // namespace cinn::optim
diff --git a/paddle/cinn/optim/insert_debug_log_callee.cc b/paddle/cinn/optim/insert_debug_log_callee.cc
new file mode 100644
index 0000000000000..6c7988b6016c8
--- /dev/null
+++ b/paddle/cinn/optim/insert_debug_log_callee.cc
@@ -0,0 +1,275 @@
+// Copyright (c) 2021 CINN Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
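+//
+// This pass instruments the lowered IR with calls to the debug-log intrinsic
+// (runtime::intrinsic::debug_log_repr), so each statement traces its own
+// execution. A rough sketch with a hypothetical tensor B: the store
+//
+//   B[i, j] = A[i, j] + 1
+//
+// gets a companion statement roughly like
+//
+//   debug_log("B[%d %d ] = %f, ", i, j, B[i, j])
+//
+// whose format string and arguments are assembled by StoreDebugInfo and
+// StoreDebugInfoBuilder below.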
+
+#include "cinn/optim/insert_debug_log_callee.h"
+
+#include <sstream>
+#include <string>
+#include <vector>
+
+#include "cinn/common/common.h"
+#include "cinn/ir/ir_mutator.h"
+#include "cinn/ir/ir_printer.h"
+#include "cinn/runtime/intrinsic.h"
+#include "cinn/utils/string.h"
+
+namespace cinn {
+namespace optim {
+using cinn::utils::StringFormat;
+
+namespace {
+
+struct StoreDebugInfoBuilder : public ir::IRVisitor {
+  std::tuple<std::string, std::vector<Expr>> operator()(const Expr *e) {
+    ir::IRVisitor::Visit(e);
+    return std::make_tuple(format_.str(), args_);
+  }
+
+ private:
+#define _BINARY_OP(Op__, repr__)          \
+  void Visit(const ir::Op__ *x) override { \
+    format_ << "(";                        \
+    ir::IRVisitor::Visit(&x->a());         \
+    format_ << " " << #repr__ << " ";      \
+    ir::IRVisitor::Visit(&x->b());         \
+    format_ << ")";                        \
+  }
+  _BINARY_OP(Add, +);
+  _BINARY_OP(Mul, *);
+  _BINARY_OP(Div, /);
+  _BINARY_OP(Sub, -);
+  _BINARY_OP(Mod, %);
+  _BINARY_OP(LT, <);
+  _BINARY_OP(LE, <=);
+  _BINARY_OP(GT, >);
+  _BINARY_OP(GE, >=);
+#undef _BINARY_OP
+
+  void Visit(const ir::Load *x) override {
+    format_ << type_specifier(x->type());
+    args_.push_back(Expr(&Reference(x)));
+  }
+
+ public:
+  void Visit(const Expr *x) override { IRVisitor::Visit(x); }
+  void Visit(const ir::IntImm *x) override {
+    format_ << type_specifier(x->type());
+    args_.push_back(&Reference(x));
+  }
+  void Visit(const ir::UIntImm *x) override {
+    format_ << type_specifier(x->type());
+    args_.push_back(&Reference(x));
+  }
+  void Visit(const ir::FloatImm *x) override {
+    format_ << type_specifier(x->type());
+    args_.push_back(&Reference(x));
+  }
+  void Visit(const ir::StringImm *x) override {}
+  void Visit(const ir::EQ *x) override {}
+  void Visit(const ir::_Var_ *x) override {}
+  void Visit(const ir::NE *x) override {}
+  void Visit(const ir::And *x) override {}
+  void Visit(const ir::Or *x) override {}
+  void Visit(const ir::Min *x) override {}
+  void Visit(const ir::Max *x) override {}
+  void Visit(const ir::Minus *x) override {}
+  void Visit(const ir::Not *x) override {}
+  void Visit(const ir::Cast *x) override {}
+  void Visit(const ir::For *x) override {}
+  void Visit(const ir::PolyFor *x) override {}
+  void Visit(const ir::Select *x) override {}
+  void Visit(const ir::IfThenElse *x) override {}
+  void Visit(const ir::Block *x) override {}
+  void Visit(const ir::Call *x) override {}
+  void Visit(const ir::Store *x) override {
+    format_ << x->tensor.as_tensor()->name << "[] = ";
+    Visit(&x->value);
+    LOG(INFO) << "store value " << x->value;
+  }
+  void Visit(const ir::Alloc *x) override {}
+  void Visit(const ir::Free *x) override {}
+  void Visit(const ir::_Buffer_ *x) override {}
+  void Visit(const ir::_Tensor_ *x) override {}
+  void Visit(const ir::_LoweredFunc_ *x) override {}
+  void Visit(const ir::_Module_ *x) override {}
+  void Visit(const ir::Let *x) override {}
+  void Visit(const ir::Reduce *x) override {}
+  void Visit(const ir::Ramp *x) override {}
+  void Visit(const ir::Broadcast *x) override {}
+  void Visit(const ir::FracOp *x) override {}
+  void Visit(const ir::Product *x) override {}
+  void Visit(const ir::Sum *x) override {}
+
+ private:
+  std::string type_specifier(const Type &type) {
+    if (type.is_float()) return "%f";
+    if (type.is_int()) return "%d";
+    CINN_NOT_IMPLEMENTED
+    return "";
+  }
+
+ private:
+  std::stringstream format_;
+  std::vector<Expr> args_;
+  bool in_load_{false};
+};
+
+struct InsertDebugLogCalleeMutator : public ir::IRMutator<> {
+  void operator()(Expr *e) { ir::IRMutator<>::Visit(e, e); }
+
+  void Visit(const ir::_LoweredFunc_ *op, Expr *expr) {
+    auto *node = expr->As<ir::_LoweredFunc_>();
+    auto *body_block = node->body.As<ir::Block>();
+    CHECK(body_block);
+
+    auto msg = StringFormat("running : %s", GetDebugString(*expr).c_str());
+    auto debug_node = CreateDebugStatement(msg);
+
+    ir::IRMutator<>::Visit(&node->body, &node->body);
+
+    auto deal_with_exprs = [&](std::vector<Expr> *exprs) {  // deal with op->argument_prepare_exprs
+      std::vector<Expr> new_stmts;
+      for (auto &expr : *exprs) {
+        auto msg = StringFormat("running : %s", GetDebugString(expr).c_str());
+        new_stmts.push_back(CreateDebugStatement(msg));
+        new_stmts.push_back(expr);
+      }
+      *exprs = new_stmts;
+    };
+
+    deal_with_exprs(&node->alloc_output_buffer_exprs);
+    deal_with_exprs(&node->dealloc_output_buffer_exprs);
+    deal_with_exprs(&node->buffer_data_cast_exprs);
+    deal_with_exprs(&node->argument_prepare_exprs);
+
+    body_block->stmts.insert(body_block->stmts.begin(), debug_node);
+  }
+
+  void Visit(const ir::Block *op, Expr *expr) {
+    auto *node = expr->As<ir::Block>();
+    std::vector<Expr> new_stmts;
+    for (auto &e : op->stmts) {
+      if (!IsDebugInfoNode(e)) {
+        std::string msg;
+        if (!e.As<ir::Store>()) {
+          msg = StringFormat("running: %s", GetDebugString(e).c_str());
+          auto debug_info_node = CreateDebugStatement(msg);
+          new_stmts.push_back(debug_info_node);
+        } else {
+          auto _msg_args_ = StoreDebugInfo(e);
+          auto &msg = std::get<0>(_msg_args_);
+          auto &args = std::get<1>(_msg_args_);
+          auto debug_info_node = CreateDebugStatement("running: " + msg, std::move(args));
+          new_stmts.push_back(debug_info_node);
+        }
+      }
+
+      ir::IRMutator<>::Visit(&e, &Reference(&e));
+
+      new_stmts.push_back(e);
+
+      if (!IsDebugInfoNode(e) && e.As<ir::Store>()) {
+        auto _msg_args_ = StoreDebugInfo(e);
+        auto &msg = std::get<0>(_msg_args_);
+        auto &args = std::get<1>(_msg_args_);
+        auto debug_info_node = CreateDebugStatement(msg, std::move(args));
+        new_stmts.push_back(debug_info_node);
+
+        {  // detailed debug
+          auto _format_args_ = StoreDebugInfoBuilder()(&e);
+          auto &format = std::get<0>(_format_args_);
+          auto &args = std::get<1>(_format_args_);
+          new_stmts.push_back(CreateDebugStatement(format, std::move(args)));
+        }
+      }
+    }
+
+    node->stmts = new_stmts;
+  }
+
+  std::string GetDebugString(const Expr &e) {
+    std::stringstream ss;
+    switch (e.node_type()) {
+      case ir::IrNodeTy::Block:
+        ss << "<block>";
+        break;
+      case ir::IrNodeTy::For: {
+        auto *node = e.As<ir::For>();
+        ss << "<For " << node->loop_var << " in [" << node->min << ", " << node->extent << ")>";
+        break;
+      }
+      case ir::IrNodeTy::PolyFor: {
+        auto *node = e.As<ir::PolyFor>();
+        ss << "<PolyFor " << node->iterator << " in [" << node->init << ", " << node->ExtractExtent() << ")"
+           << " with condition: " << node->condition << ">";
+        break;
+      }
+      case ir::IrNodeTy::_LoweredFunc_: {
+        auto *node = e.As<ir::_LoweredFunc_>();
+        ss << "<LoweredFunc " << node->name << ">";
+        break;
+      }
+      case ir::IrNodeTy::Call: {
+        auto *node = e.As<ir::Call>();
+        if (node->name == runtime::intrinsic::debug_log_repr) {
+          return "";
+        }
+        ss << e;
+        break;
+      }
+      default:
+        ss << "NodeTy " << e->node_type() << ": " << e;
+        break;
+    }
+
+    return ss.str();
+  }
+
+  std::tuple<std::string, std::vector<Expr>> StoreDebugInfo(const Expr &e) {
+    const auto *node = e.As<ir::Store>();
+
+    std::stringstream format_ss;
+
+    {  // destination
+      format_ss << node->tensor.as_tensor()->name << "[";
+      for (auto &index : node->indices) format_ss << "%d ";
+      format_ss << "] = %f";
+    }
+
+    format_ss << ", ";
+
+    std::vector<Expr> val_reprs;
+    for (auto &index : node->indices) val_reprs.push_back(index);
+    val_reprs.push_back(ir::Load::Make(node->tensor, node->indices));
+
+    return std::make_tuple(format_ss.str(), val_reprs);
+  }
+
+  inline bool IsDebugInfoNode(const Expr &e) {
+    return e.As<ir::Call>() && e.As<ir::Call>()->name == runtime::intrinsic::debug_log_repr;
+  }
+
+  Expr CreateDebugStatement(const std::string &msg, std::vector<Expr> &&args = {}) {
+    args.insert(args.begin(), Expr(msg));
+    return ir::Call::Make(
+        Void(), runtime::intrinsic::debug_log_repr, args, {}, ir::CallType::Intrinsic, ir::FunctionRef(), 0);
+  }
+};
+
+} // namespace
+
+void InsertDebugLogCallee(Expr *e) { InsertDebugLogCalleeMutator()(e); }
+
+} // namespace optim
+} // namespace cinn
diff --git a/paddle/cinn/optim/insert_debug_log_callee.h b/paddle/cinn/optim/insert_debug_log_callee.h
new file mode 100644
index 0000000000000..470c909d36ce1
--- /dev/null
+++ b/paddle/cinn/optim/insert_debug_log_callee.h
@@ -0,0 +1,27 @@
+// Copyright (c) 2021 CINN Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include <string>
+#include <vector>
+
+#include "cinn/ir/ir.h"
+
+namespace cinn {
+namespace optim {
+
+void InsertDebugLogCallee(Expr* e);
+
+} // namespace optim
+} // namespace cinn
diff --git a/paddle/cinn/optim/ir_copy.cc b/paddle/cinn/optim/ir_copy.cc
new file mode 100644
index 0000000000000..0603a2998def7
--- /dev/null
+++ b/paddle/cinn/optim/ir_copy.cc
@@ -0,0 +1,480 @@
+// Copyright (c) 2021 CINN Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "cinn/optim/ir_copy.h"
+
+#include <map>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "cinn/common/common.h"
+#include "cinn/common/ir_util.h"
+#include "cinn/ir/ir_mutator.h"
+#include "cinn/ir/ir_printer.h"
+#include "cinn/ir/ir_schedule.h"
+#include "cinn/ir/module.h"
+
+namespace cinn {
+namespace optim {
+using namespace ir;  // NOLINT
+
+struct IRCopyVisitor : public ir::IRVisitorBase<Expr> {
+  // Use maps to unify all the copied tensors and buffers.
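+  // Without unification, two references to the same tensor would be copied
+  // into two distinct nodes. Routing every copy through these maps keeps one
+  // node per name; e.g. (hypothetical tensor A) both loads in A[i] + A[i + 1]
+  // point at the same copied _Tensor_ afterwards.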
+  std::map<std::string, ir::_Tensor_*> tensor_map;
+  std::map<std::string, ir::_Buffer_*> buffer_map;
+
+  Expr Visit(const Expr* op) override { return IRVisitorBase::Visit(op); }
+
+ protected:
+  // The methods of ir nodes follow the order defined in node.h
+
+  Expr Visit(const ir::IntImm* op) override { return Expr(make_shared<IntImm>(op->type(), op->value)); }
+  Expr Visit(const ir::UIntImm* op) override { return Expr(make_shared<UIntImm>(op->type(), op->value)); }
+  Expr Visit(const ir::FloatImm* op) override { return Expr(make_shared<FloatImm>(op->type(), op->value)); }
+  Expr Visit(const ir::StringImm* op) override { return Expr(common::make_shared<StringImm>(op->value)); }
+
+  Expr Visit(const ir::Cast* op) override {
+    auto v = Visit(&op->v());
+    return Cast::Make(op->type(), v);
+  }
+
+  Expr Visit(const Select* op) override {
+    auto condition = Visit(&op->condition);
+    auto true_value = Visit(&op->true_value);
+    auto false_value = Visit(&op->false_value);
+    return Select::Make(condition, true_value, false_value);
+  }
+
+  Expr Visit(const IfThenElse* op) override {
+    auto condition = Visit(&op->condition);
+    auto true_case = Visit(&op->true_case);
+    Expr false_case;
+    if (op->false_case.defined()) false_case = Visit(&op->false_case);
+    return IfThenElse::Make(condition, true_case, false_case);
+  }
+
+  Expr Visit(const Block* op) override {
+    std::vector<Expr> stmts;
+    for (auto& s : op->stmts) {
+      stmts.push_back(Visit(&s));
+    }
+    return Block::Make(stmts);
+  }
+
+  Expr Visit(const Call* op) override {
+    auto read_args = Visit(op->read_args);
+    auto write_args = Visit(op->write_args);
+    return Call::Make(op->type(), op->name, read_args, write_args, op->call_type, FunctionRef(), 0, op->attrs);
+  }
+
+  Expr Visit(const _Var_* op) override {
+    auto* n = make_shared<_Var_>();
+
+    n->name = op->name;
+    n->is_reduce_axis = op->is_reduce_axis;
+    n->set_type(op->type());
+
+    if (op->lower_bound.defined()) {
+      n->lower_bound = Visit(&op->lower_bound);
+    }
+    if (op->upper_bound.defined()) {
+      n->upper_bound = Visit(&op->upper_bound);
+    }
+
+    return Expr(n);
+  }
+
+  Expr Visit(const Load* op) override {
+    auto tensor = Visit(&op->tensor);
+    std::vector<Expr> indices;
+    for (auto& idx : op->indices) {
+      indices.push_back(Visit(&idx));
+    }
+    return Load::Make(tensor, indices);
+  }
+
+  Expr Visit(const Store* op) override {
+    auto tensor = Visit(&op->tensor);
+    auto value = Visit(&op->value);
+    std::vector<Expr> indices;
+    for (auto& idx : op->indices) indices.push_back(Visit(&idx));
+
+    return Store::Make(tensor, value, indices);
+  }
+
+  Expr Visit(const Alloc* op) override {
+    auto extents = Visit(op->extents);
+    Expr condition;
+    Expr body;
+    if (op->condition.defined()) condition = Visit(&op->condition);
+    if (op->body.defined()) body = Visit(&op->body);
+
+    return Alloc::Make(op->destination, op->type(), extents, condition, body);
+  }
+
+  Expr Visit(const Free* op) override { return Free::Make(op->destination); }
+
+  Expr Visit(const _Buffer_* op) override {
+    if (buffer_map.count(op->name)) {
+      return buffer_map[op->name];
+    }
+
+    auto shape = Visit(op->shape);
+    auto strides = Visit(op->strides);
+    auto name = op->name;
+    auto scope = op->scope;
+    int data_alignment = op->data_alignment;
+    auto elem_offset = Visit(&op->elem_offset);
+    int offset_factor = op->offset_factor;
+    Target target = op->target;
+
+    auto new_node = _Buffer_::Make(name, shape);
+    new_node->strides = strides;
+    new_node->dtype = op->dtype;  // copy data element's type.
+    new_node->name = name;
+    new_node->scope = scope;
+    new_node->data_alignment = data_alignment;
+    new_node->elem_offset = elem_offset;
+    new_node->offset_factor = offset_factor;
+    new_node->target = target;
+    new_node->memory_type = op->memory_type;
+    new_node->set_type(op->type());
+    op->CopyMeta(new_node.As<ir::_Buffer_>());
+
+    buffer_map[op->name] = new_node->self();
+
+    return Expr(ir::Buffer(new_node));
+  }
+
+  Expr Visit(const _Tensor_* op) override {
+    if (tensor_map.count(op->name)) {
+      return tensor_map[op->name];
+    }
+
+    auto shape = Visit(op->shape);
+    auto domain = Visit(op->domain);
+    auto buffer_expr = Expr(op->buffer);
+    // TODO(Superjomn) copy the operation.
+    auto operation = op->operation;
+    auto name = op->name;
+    auto tensor = make_shared<_Tensor_>();
+
+    if (buffer_expr.defined()) {
+      auto buffer = Visit(&buffer_expr);
+      tensor->buffer = buffer.as_buffer_ref();
+    }
+    tensor->domain = domain;
+    tensor->shape = shape;
+    tensor->reduce_axis = op->reduce_axis;
+    tensor->operation = operation;
+    tensor->name = name;
+    tensor->set_type(op->type());
+    tensor->axis_ = op->axis_;
+
+    tensor_map[tensor->name] = tensor;
+
+    return tensor;
+  }
+
+  Expr Visit(const For* op) override {
+    auto extent = Visit(&op->extent);
+    auto min = Visit(&op->min);
+    auto body = Visit(&op->body);
+
+    return ir::For::Make(
+        op->loop_var, min, extent, op->for_type(), op->device_api, body, op->vectorize_info(), op->bind_info());
+  }
+
+  Expr Visit(const ir::PolyFor* op) override {
+    auto init = Visit(&op->init);
+    auto condition = Visit(&op->condition);
+    auto inc = Visit(&op->inc);
+    auto body = Visit(&op->body);
+    auto expr = PolyFor::Make(op->iterator,
+                              init,
+                              condition,
+                              inc,
+                              op->for_type(),
+                              op->device_api,
+                              body,
+                              op->vectorize_info(),
+                              op->bind_info());
+    return expr;
+  }
+
+  Expr Visit(const ir::_Module_* op) override {
+    std::vector<Expr> buffers;
+    std::vector<Expr> functions;
+    std::vector<Expr> submodules;
+
+    for (auto& expr : op->buffers) {
+      buffers.push_back(Visit(&expr));
+    }
+
+    for (auto& expr : op->functions) {
+      functions.push_back(Visit(&expr));
+    }
+
+    for (auto& expr : op->submodules) {
+      submodules.push_back(Visit(&expr));
+    }
+
+    auto res = ir::_Module_::Make(op->name, op->target);
+    res->buffers = buffers;
+    res->functions = functions;
+    res->submodules = submodules;
+
+    return Expr(res);
+  }
+
+  Expr Visit(const _LoweredFunc_* op) override {
+    auto func = make_shared<_LoweredFunc_>();
+
+    func->name = op->name;
+    func->args = op->args;
+    func->body = Visit(&op->body);
+    func->temp_bufs = op->temp_bufs;
+
+    func->device_api = op->device_api;
+
+    func->cuda_axis_info = op->cuda_axis_info;
+
+    std::vector<Expr> alloc_output_buffer_exprs;
+    std::vector<Expr> dealloc_output_buffer_exprs;
+    std::vector<Expr> buffer_data_cast_exprs;
+    std::vector<Expr> argument_prepare_exprs;
+
+#define COPY_ADD_FIELD(field__)      \
+  for (auto& expr : op->field__) {   \
+    field__.push_back(Visit(&expr)); \
+  }                                  \
+  func->field__ = std::move(field__);
+
+    COPY_ADD_FIELD(alloc_output_buffer_exprs);
+    COPY_ADD_FIELD(dealloc_output_buffer_exprs);
+    COPY_ADD_FIELD(buffer_data_cast_exprs);
+    COPY_ADD_FIELD(argument_prepare_exprs);
+
+    return func;
+  }
+
+  Expr Visit(const Let* op) override {
+    auto value = Visit(&op->symbol);
+    auto body = Visit(&op->body);
+
+    return Let::Make(value, body);
+  }
+
+  Expr Visit(const Reduce* op) override {
+    auto init = Visit(&op->init);
+    auto body = Visit(&op->body);
+    std::vector<Var> reduce_axis(op->reduce_axis.begin(), op->reduce_axis.end());
+    return Reduce::Make(op->reduce_type, init, body, reduce_axis);
+  }
+  Expr Visit(const Ramp* op) override {
+    auto base = Visit(&op->base);
+    auto stride = Visit(&op->stride);
+    int lanes = op->lanes;
+    return Ramp::Make(base, stride, lanes);
+  }
+
+  Expr Visit(const Broadcast* op) override {
+    auto value = Visit(&op->value);
+    int lanes = op->lanes;
+    CHECK(value.defined());
+    CHECK(value.type().valid());
+
+    auto* n = make_shared<Broadcast>();
+    n->value = value;
+    n->lanes = lanes;
+    return Expr(n);
+  }
+
+  Expr Visit(const FracOp* op) override {
+    auto a = Visit(&op->a());
+    auto b = Visit(&op->b());
+    CHECK(a.defined());
+    CHECK(b.defined());
+
+    auto* n = make_shared<FracOp>();
+    n->a() = a;
+    n->b() = b;
+    return Expr(n);
+  }
+
+  Expr Visit(const Product* op) override {
+    std::vector<Expr> operands;
+    for (auto& v : op->operands()) {
+      operands.push_back(Visit(&v));
+    }
+    return Product::Make(operands);
+  }
+
+  Expr Visit(const Sum* op) override {
+    std::vector<Expr> operands;
+    for (auto& v : op->operands()) {
+      operands.push_back(Visit(&v));
+    }
+    return Sum::Make(operands);
+  }
+
+  Expr Visit(const ir::PrimitiveNode* op) override {
+    std::vector<std::vector<Expr>> arguments;
+    for (auto& args : op->arguments) {
+      arguments.push_back(Visit(args));
+    }
+
+    auto n = common::make_shared<ir::PrimitiveNode>();
+    n->name = op->name;
+    n->attrs = op->attrs;  // attrs are PODs
+    n->arguments = arguments;
+    return Expr(n);
+  }
+
+  Expr Visit(const ir::_BufferRange_* op) {
+    std::vector<Var> ranges;
+    for (auto& range_var : op->ranges) {
+      auto* var = range_var.As<_Var_>();
+      ranges.push_back(Visit(var));
+    }
+    return ir::_BufferRange_::Make(Visit(&op->buffer), ranges);
+  }
+
+  Expr Visit(const ir::ScheduleBlock* op) {
+    std::vector<Var> iter_vars;
+    for (auto iter_var : op->iter_vars) {
+      auto* var = iter_var.As<_Var_>();
+      CHECK(var);
+      iter_vars.push_back(Visit(var));
+    }
+    std::vector<Expr> read_buffers;
+    for (auto buffer_range : op->read_buffers) {
+      read_buffers.push_back(Visit(&buffer_range));
+    }
+    std::vector<Expr> write_buffers;
+    for (auto buffer_range : op->write_buffers) {
+      write_buffers.push_back(Visit(&buffer_range));
+    }
+    Expr res = ir::ScheduleBlock::Make(iter_vars, read_buffers, write_buffers, op->name, Visit(&op->body));
+    res.As<ir::ScheduleBlock>()->attrs = op->attrs;
+    return res;
+  }
+
+  Expr Visit(const ir::ScheduleBlockRealize* op) {
+    std::vector<Expr> iter_values;
+    for (auto iter_value : op->iter_values) {
+      iter_values.push_back(Visit(&iter_value));
+    }
+    return ir::ScheduleBlockRealize::Make(iter_values, Visit(&op->schedule_block));
+  }
+
+#define __(x__) Expr Visit(const ir::intrinsics::x__* op);
+  INTRINSIC_KIND_FOR_EACH(__)
+#undef __
+
+  Expr Visit(const ir::IntrinsicOp* op) override {
+    switch (op->getKind()) {
+#define __(x__)                                        \
+  case ir::IntrinsicKind::k##x__:                      \
+    return Visit(llvm::dyn_cast<ir::intrinsics::x__>(op));
+      INTRINSIC_KIND_FOR_EACH(__)
+#undef __
+    }
+  }
+
+#define OP_BINARY_HANDLE(op__)               \
+  Expr Visit(const ir::op__* op) override {  \
+    auto a = IRVisitorBase::Visit(&op->a()); \
+    auto b = IRVisitorBase::Visit(&op->b()); \
+    return op__::Make(a, b);                 \
+  }
+  NODETY_BINARY_OP_FOR_EACH(OP_BINARY_HANDLE)
+#undef OP_BINARY_HANDLE
+
+#define OP_UNARY_HANDLE(op__)                \
+  Expr Visit(const op__* op) override {      \
+    auto v = IRVisitorBase::Visit(&op->v()); \
+    return op__::Make(v);                    \
+  }
+  NODETY_UNARY_OP_FOR_EACH(OP_UNARY_HANDLE)
+#undef OP_UNARY_HANDLE
+
+  std::vector<Expr> Visit(const std::vector<Expr>& vs) {
+    std::vector<Expr> copied;
+    for (auto& e : vs) {
+      copied.push_back(Visit(&e));
+    }
+    return copied;
+  }
+};
+
+Expr IRCopyVisitor::Visit(const ir::intrinsics::BufferGetDataHandle* op) {
+  return intrinsics::BufferGetDataHandle::Make(Visit(&op->buffer));
+}
+Expr IRCopyVisitor::Visit(const ir::intrinsics::BufferGetDataConstHandle* op) {
+  return intrinsics::BufferGetDataConstHandle::Make(Visit(&op->buffer));
+}
+Expr IRCopyVisitor::Visit(const ir::intrinsics::PodValueToX* op) {
+  return intrinsics::PodValueToX::Make(Visit(&op->pod_value_ptr), op->GetOutputType(0));
+}
+Expr IRCopyVisitor::Visit(const ir::intrinsics::BufferCreate* op) {
+  return intrinsics::BufferCreate::Make(Visit(&op->buffer));
+}
+Expr IRCopyVisitor::Visit(const ir::intrinsics::GetAddr* op) { return intrinsics::GetAddr::Make(Visit(&op->data)); }
+Expr IRCopyVisitor::Visit(const ir::intrinsics::ArgsConstruct* op) {
+  llvm::SmallVector<Expr> args;
+  for (auto& arg : op->args) {
+    args.push_back(Visit(&arg));
+  }
+  return intrinsics::ArgsConstruct::Make(op->var, args);
+}
+Expr IRCopyVisitor::Visit(const ir::intrinsics::BuiltinIntrin* op) {
+  return intrinsics::BuiltinIntrin::Make(op->name, op->args, op->id, op->arg_nums, op->type());
+}
+
+Expr IRCopy(Expr x) {
+  IRCopyVisitor visitor;
+  auto copied = visitor.Visit(&x);
+  return copied;
+}
+
+std::vector<Expr> IRCopy(const std::vector<Expr>& x) {
+  std::vector<Expr> res;
+  for (auto& i : x) {
+    res.emplace_back(IRCopy(i));
+  }
+  return res;
+}
+
+ir::ModuleExpr IRCopy(const ir::ModuleExpr& x) { return ir::ModuleExpr(IRCopy(x.GetExprs())); }
+
+ir::LoweredFunc IRCopy(const ir::LoweredFunc& x) {
+  ir::Expr copy_func_expr = IRCopy(static_cast<ir::Expr>(x));
+  ir::_LoweredFunc_* copy_func_ptr = copy_func_expr.As<ir::_LoweredFunc_>();
+  return ir::LoweredFunc(copy_func_ptr);
+}
+
+// TODO(zhhsplendid): make IRCopy of std::vector a template function
+std::vector<ir::LoweredFunc> IRCopy(const std::vector<ir::LoweredFunc>& x) {
+  std::vector<ir::LoweredFunc> res;
+  for (const auto& i : x) {
+    res.emplace_back(IRCopy(i));
+  }
+  return res;
+}
+
+} // namespace optim
+} // namespace cinn
diff --git a/paddle/cinn/optim/ir_copy.h b/paddle/cinn/optim/ir_copy.h
new file mode 100644
index 0000000000000..38baef7067f11
--- /dev/null
+++ b/paddle/cinn/optim/ir_copy.h
@@ -0,0 +1,43 @@
+// Copyright (c) 2021 CINN Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <string>
+#include <vector>
+
+#include "cinn/ir/ir.h"
+#include "cinn/ir/lowered_func.h"
+
+namespace cinn {
+
+namespace ir {
+class ModuleExpr;
+} // namespace ir
+
+namespace optim {
+
+//! Deep-copy an expression.
+Expr IRCopy(Expr x);
+
+std::vector<Expr> IRCopy(const std::vector<Expr>& x);
+
+ir::ModuleExpr IRCopy(const ir::ModuleExpr& x);
+
+ir::LoweredFunc IRCopy(const ir::LoweredFunc& x);
+
+std::vector<ir::LoweredFunc> IRCopy(const std::vector<ir::LoweredFunc>& x);
+
+} // namespace optim
+} // namespace cinn
diff --git a/paddle/cinn/optim/ir_copy_test.cc b/paddle/cinn/optim/ir_copy_test.cc
new file mode 100644
index 0000000000000..ee592fda58aed
--- /dev/null
+++ b/paddle/cinn/optim/ir_copy_test.cc
@@ -0,0 +1,31 @@
+// Copyright (c) 2021 CINN Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "cinn/optim/ir_copy.h"
+
+#include <gtest/gtest.h>
+
+#include "cinn/ir/ir_printer.h"
+
+namespace cinn {
+namespace optim {
+
+TEST(IrCopy, basic) {
+  Expr a(1.f);
+  auto aa = IRCopy(a);
+  LOG(INFO) << "aa " << aa;
+}
+
+} // namespace optim
+} // namespace cinn
diff --git a/paddle/cinn/optim/ir_replace.cc b/paddle/cinn/optim/ir_replace.cc
new file mode 100755
index 0000000000000..9ebf0c7271680
--- /dev/null
+++ b/paddle/cinn/optim/ir_replace.cc
@@ -0,0 +1,64 @@
+// Copyright (c) 2021 CINN Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "cinn/optim/ir_replace.h"
+
+#include <set>
+
+#include "cinn/ir/ir_mutator.h"
+#include "cinn/ir/ir_printer.h"
+#include "cinn/optim/ir_copy.h"
+#include "cinn/utils/string.h"
+
+namespace cinn {
+namespace optim {
+using utils::GetStreamCnt;
+
+namespace {
+
+struct IrReplaceMutator : ir::IRMutator<Expr*> {
+  std::set<ir::IrNodeTy> valid_nodetys{{ir::IrNodeTy::Broadcast, ir::IrNodeTy::_Var_}};
+
+  IrReplaceMutator(ir::Expr from, Expr to) : from_(from), to_(to), from_repr_(GetStreamCnt(from)) {
+    CHECK(valid_nodetys.count(from->node_type())) << "Not a valid node type, got " << from->node_type();
+  }
+  void operator()(Expr* expr) { ir::IRMutator<>::Visit(expr, expr); }
+
+ private:
+  void Visit(const ir::_Var_* op, Expr* expr) override {
+    if (op->node_type() == from_->node_type() && from_repr_ == GetStreamCnt(*expr)) {
+      *expr = optim::IRCopy(to_);
+    }
+  }
+
+  void Visit(const ir::Broadcast* op, Expr* expr) override {
+    if (op->node_type() == from_->node_type() && from_repr_ == GetStreamCnt(*expr)) {
+      *expr = optim::IRCopy(to_);
+    }
+  }
+
+  std::string from_repr_;
+  ir::Expr from_;
+  Expr to_;
+};
+
+} // namespace
+
+void IrReplace(ir::Expr* expr, ir::Expr from, ir::Expr to) {
+  CHECK(expr);
+  IrReplaceMutator(from, to)(expr);
+}
+
+} // namespace optim
+} // namespace cinn
diff --git a/paddle/cinn/optim/ir_replace.h b/paddle/cinn/optim/ir_replace.h
new file mode 100644
index 0000000000000..c6982056693e4
--- /dev/null
+++ b/paddle/cinn/optim/ir_replace.h
@@ -0,0 +1,27 @@
+// Copyright (c) 2021 CINN Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include <string>
+
+#include "cinn/ir/ir.h"
+
+namespace cinn {
+namespace optim {
+
+//! Replace the expression \p from with the expression \p to in the expression \p expr. Only Broadcast and _Var_
+//! nodes are supported as \p from.
+void IrReplace(ir::Expr* expr, ir::Expr from, ir::Expr to);
+
+} // namespace optim
+} // namespace cinn
diff --git a/paddle/cinn/optim/ir_simplify.cc b/paddle/cinn/optim/ir_simplify.cc
new file mode 100644
index 0000000000000..0ed3d92c93aeb
--- /dev/null
+++ b/paddle/cinn/optim/ir_simplify.cc
@@ -0,0 +1,365 @@
+// Copyright (c) 2021 CINN Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "cinn/optim/ir_simplify.h"
+
+#include <map>
+#include <string>
+#include <vector>
+
+#include <ginac/ginac.h>
+#include <glog/logging.h>
+
+#include "cinn/common/arithmatic.h"
+#include "cinn/common/cas.h"
+#include "cinn/common/ir_util.h"
+#include "cinn/ir/ir_mutator.h"
+#include "cinn/ir/ir_operators.h"
+#include "cinn/ir/ir_printer.h"
+#include "cinn/ir/ir_visitor.h"
+#include "cinn/ir/tensor.h"
+#include "cinn/optim/cast_simplify.h"
+#include "cinn/utils/string.h"
+
+namespace cinn {
+namespace optim {
+using namespace ir;  // NOLINT
+using common::ExprToGinacConverter;
+using utils::GetStreamCnt;
+using utils::Replace;
+
+namespace {
+
+//! Simplify some sub-expressions in the `expr`. Since the simplify strategy only fits several kinds of IR nodes, we
+//! partition the original expression into several sub-expressions supported by the simplifier, and process each of
+//! them.
+void PartialSimplify(Expr* expr, const absl::flat_hash_map<std::string, common::CasInterval>& var_intervals = {}) {
+  *expr = common::AutoSimplify(*expr, var_intervals);
+}
+
+//! Simplify the expression, except for the Store and Load nodes themselves.
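+//
+// For instance (a rough sketch): inside a loop with 0 <= i < 20, an expression like
+//   (i * 0) + (2 * a) + (3 * a) + 1 + 2
+// folds to
+//   (5 * a) + 3
+// while the Load/Store nodes themselves are visited but left in place.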
+struct SimplifyButStoreLoadMutator : public ir::IRMutator<ir::Expr*> {
+  common::cas_intervals_t& var_intervals;
+  explicit SimplifyButStoreLoadMutator(common::cas_intervals_t& var_intervals) : var_intervals(var_intervals) {}
+
+  void operator()(Expr* x) { ir::IRMutator<ir::Expr*>::Visit(x, x); }
+
+  using ir::IRMutator<>::Visit;
+
+#define __(op__) \
+  void Visit(const op__* op, Expr* expr) override { PartialSimplify(expr, var_intervals); }
+
+  __(Add)
+  __(Mul)
+  __(Sub)
+  __(Div)
+  __(Min)
+  __(Max)
+#undef __
+
+  void Visit(const Ramp* op, Expr* expr) override {
+    auto* node = expr->As<ir::Ramp>();
+    CHECK(common::IsPureMath(node->base));
+    CHECK(common::IsPureMath(node->stride));
+    PartialSimplify(&node->base, var_intervals);
+    PartialSimplify(&node->stride, var_intervals);
+  }
+
+  void Visit(const Cast* op, Expr* expr) override {
+    auto* node = expr->As<Cast>();
+    Visit(&node->v(), &node->v());
+  }
+
+  void Visit(const PolyFor* op, Expr* expr) override {
+    auto* node = expr->As<PolyFor>();
+    node->condition = common::SolveInequality(op->condition, op->iterator);
+
+    Visit(&node->body, &node->body);
+  }
+
+  void Visit(const For* op, Expr* expr) override {
+    auto* node = expr->As<For>();
+    Visit(&node->min, &node->min);
+    Visit(&node->extent, &node->extent);
+    auto* min_i = op->min.As<IntImm>();
+    auto* extent_i = op->extent.As<IntImm>();
+    if (min_i && extent_i && extent_i->value > min_i->value) {
+      var_intervals.emplace(op->loop_var->name, common::CasInterval{min_i->value, extent_i->value - 1});
+    } else {
+      var_intervals.emplace(op->loop_var->name, common::CasInterval{op->min, op->extent - 1});
+    }
+
+    Visit(&node->body, &node->body);
+    if (min_i && extent_i) {
+      var_intervals.erase(op->loop_var->name);
+    }
+  }
+
+  void Visit(const _Tensor_* op, Expr* expr) override {
+    auto* node = expr->As<ir::_Tensor_>();
+
+    for (auto& e : node->shape) {
+      PartialSimplify(&e, var_intervals);
+    }
+    for (auto& e : node->domain) {
+      PartialSimplify(&e, var_intervals);
+    }
+  }
+};
+
+struct SimplifyLoadMutator : public ir::IRMutator<ir::Expr*> {
+  void operator()(Expr* x) { ir::IRMutator<ir::Expr*>::Visit(x, x); }
+
+  void Visit(const Load* expr, Expr* op) override {
+    auto* node = op->As<Load>();
+    for (auto& idx : node->indices) {
+      if (common::IsPureMath(idx)) {
+        PartialSimplify(&idx, var_intervals_);
+      } else {
+        SimplifyButStoreLoadMutator mutator(var_intervals_);
+        mutator(&idx);
+      }
+    }
+  }
+
+  void Visit(const For* op, Expr* expr) override {
+    auto* min_i = op->min.As<IntImm>();
+    auto* extent_i = op->extent.As<IntImm>();
+    if (min_i && extent_i && extent_i->value > min_i->value) {
+      var_intervals_.emplace(op->loop_var->name, common::CasInterval{min_i->value, extent_i->value - 1});
+    }
+
+    auto* node = expr->As<For>();
+
+    operator()(&node->body);
+    operator()(&node->extent);
+
+    if (min_i && extent_i) {
+      var_intervals_.erase(op->loop_var->name);
+    }
+  }
+
+  common::cas_intervals_t var_intervals_;
+};
+
+struct SimplifyStoreMutator : public ir::IRMutator<ir::Expr*> {
+  void operator()(Expr* x) { ir::IRMutator<ir::Expr*>::Visit(x, x); }
+
+  void Visit(const Store* expr, Expr* op) override {
+    auto* node = op->As<Store>();
+
+    for (auto& idx : node->indices) {
+      if (common::IsPureMath(idx)) {
+        PartialSimplify(&idx, var_intervals_);
+      } else {
+        SimplifyButStoreLoadMutator mutator(var_intervals_);
+        mutator(&idx);
+      }
+    }
+  }
+
+  void Visit(const For* op, Expr* expr) override {
+    auto* min_i = op->min.As<IntImm>();
+    auto* extent_i = op->extent.As<IntImm>();
+    if (min_i && extent_i) {
+      var_intervals_.emplace(op->loop_var->name, common::CasInterval{min_i->value, extent_i->value - 1});
+    }
+
+    auto* node = expr->As<For>();
+
+    operator()(&node->body);
+    operator()(&node->extent);
+
+    if (min_i && extent_i) {
+      var_intervals_.erase(op->loop_var->name);
+    }
+  }
+
+  common::cas_intervals_t var_intervals_;
+};
+
+struct SimplifyRampMutator : public ir::IRMutator<Expr*> {
+  void operator()(Expr* x) { ir::IRMutator<ir::Expr*>::Visit(x, x); }
+
+  void Visit(const Ramp* op, Expr* expr) override {
+    auto* node = expr->As<ir::Ramp>();
+
+    CHECK(common::IsPureMath(node->base)) << node->base << " is not pure math!";
+    CHECK(common::IsPureMath(node->stride)) << node->stride << " is not pure math!";
+    Simplify(&node->base);
+    Simplify(&node->stride);
+  }
+  // ramp + ramp
+  void Visit(const Add* op, Expr* expr) override {
+    auto* node = expr->As<ir::Add>();
+    Expr a = node->a();
+    Expr b = node->b();
+    auto a_ramp = a.As<ir::Ramp>();
+    auto b_ramp = b.As<ir::Ramp>();
+
+    if (a_ramp && b_ramp && a_ramp->lanes == b_ramp->lanes) {
+      Expr base_add = common::AutoSimplify(a_ramp->base + b_ramp->base);
+      Expr stride_add = common::AutoSimplify(a_ramp->stride + b_ramp->stride);
+      *expr = ir::Ramp::Make(base_add, stride_add, a_ramp->lanes);
+    }
+  }
+};
+
+struct SimplifyIfThenElseMutator : public ir::IRMutator<> {
+  void operator()(Expr* x) { ir::IRMutator<>::Visit(x, x); }
+
+  using ir::IRMutator<>::Visit;
+
+  void Visit(const IfThenElse* op, Expr* expr) override {
+    auto* node = expr->As<ir::IfThenElse>();
+    node->condition = common::AutoSimplify(node->condition);
+
+    if (node->true_case.defined()) Visit(&node->true_case, &node->true_case);
+    if (node->false_case.defined()) Visit(&node->false_case, &node->false_case);
+  }
+};
+
+struct ReplaceFracWithDivMutator : public ir::IRMutator<> {
+  void operator()(Expr* x) { ir::IRMutator<>::Visit(x, x); }
+
+  void Visit(const FracOp* op, Expr* expr) override {
+    auto* node = expr->As<ir::FracOp>();
+
+    ir::IRMutator<>::Visit(&node->operand(0), &node->operand(0));
+    ir::IRMutator<>::Visit(&node->operand(1), &node->operand(1));
+
+    *expr = ir::Div::Make(node->operand(0), node->operand(1));
+  }
+};
+
+struct SimplifyBlocksMutator : public ir::IRMutator<> {
+  SimplifyBlocksMutator() {}
+
+  void operator()(Expr* x) { ir::IRMutator<>::Visit(x, x); }
+
+  using ir::IRMutator<>::Visit;
+
+  void Visit(const Block* op, Expr* expr) override {
+    auto* node = expr->As<ir::Block>();
+
+    if (node->stmts.size() == 1 && node->stmts[0].As<ir::Block>()) {
+      VLOG(6) << "Simplify size-1 ir::Block";
+      *expr = node->stmts[0];
+      Visit(expr, expr);
+    } else {
+      for (auto& s : node->stmts) {
+        Visit(&s, &s);
+      }
+      std::vector<Expr> stmts;
+      for (auto& s : node->stmts) {
+        if (s.As<ir::Block>()) {
+          VLOG(6) << "Simplify ir::Block inside ir::Block";
+          auto inner_block = s.As<ir::Block>();
+          for (auto inner_stmt : inner_block->stmts) {
+            stmts.push_back(inner_stmt);
+          }
+        } else {
+          stmts.push_back(s);
+        }
+      }
+      expr->As<ir::Block>()->stmts = stmts;
+    }
+  }
+
+  void Visit(const IfThenElse* op, Expr* expr) override {
+    if (op->condition.As<ir::UIntImm>()) {
+      if (op->condition.as_bool() == false) {
+        VLOG(6) << "Simplify ir::IfThenElse false block";
+        if (expr->As<ir::IfThenElse>()->false_case.defined()) {
+          *expr = expr->As<ir::IfThenElse>()->false_case;
+        } else {
+          *expr = ir::Block::Make({});
+        }
+      } else {
+        if (expr->As<ir::IfThenElse>()->true_case.defined()) {
+          VLOG(6) << "Simplify ir::IfThenElse true block";
+          *expr = expr->As<ir::IfThenElse>()->true_case;
+        } else {
+          *expr = ir::Block::Make({});
+        }
+      }
+      ir::IRMutator<>::Visit(expr, expr);
+      return;
+    }
+    ir::IRMutator<>::Visit(op, expr);
+  }
+};
+
+struct SimplifyForLoopsMutator : public ir::IRMutator<> {
+  absl::flat_hash_map<std::string, common::CasInterval> var_intervals;
+  SimplifyForLoopsMutator() {}
+
+  void operator()(Expr* x) { ir::IRMutator<>::Visit(x, x); }
+
+  using ir::IRMutator<>::Visit;
+
+  void Visit(const For* op, Expr* expr) override {
+    auto* node = expr->As<For>();
+    Visit(&node->min, &node->min);
+    Visit(&node->extent, &node->extent);
+    auto* min_i = node->min.As<IntImm>();
+    auto* extent_i = node->extent.As<IntImm>();
+    if (min_i && extent_i && extent_i->value > min_i->value && extent_i->value - min_i->value == 1) {
+      VLOG(6) << "Simplify current For Loop";
+      std::string var_name = node->loop_var->name;
+      var_intervals.emplace(var_name, common::CasInterval{min_i->value, extent_i->value - 1});
+      if (node->body.As<Block>() && node->body.As<Block>()->stmts.size() == 1) {
+        *expr = node->body.As<Block>()->stmts[0];
+      } else {
+        *expr = node->body;
+      }
+      Visit(expr, expr);
+      var_intervals.erase(var_name);
+    } else {
+      Visit(&node->body, &node->body);
+    }
+  }
+
+  void Visit(const _Var_* op, Expr* expr) override {
+    auto* node = expr->As<ir::_Var_>();
+
+    if (var_intervals.count(node->name)) {
+      auto loop_range = var_intervals.at(node->name);
+      *expr = Expr(loop_range.l);
+    }
+  }
+};
+
+} // namespace
+
+void Simplify(Expr* expr) {
+  VLOG(3) << "Begin Simplify " << *expr;
+  optim::CastSimplify(expr);
+  SimplifyRampMutator()(expr);
+  SimplifyLoadMutator()(expr);
+  SimplifyStoreMutator()(expr);
+  SimplifyIfThenElseMutator()(expr);
+
+  common::cas_intervals_t var_intervals;
+  SimplifyButStoreLoadMutator mutator(var_intervals);
+  mutator(expr);
+
+  ReplaceFracWithDivMutator()(expr);
+}
+
+void SimplifyForLoops(Expr* expr) { SimplifyForLoopsMutator()(expr); }
+void SimplifyBlocks(Expr* expr) { SimplifyBlocksMutator()(expr); }
+
+} // namespace optim
+} // namespace cinn
diff --git a/paddle/cinn/optim/ir_simplify.h b/paddle/cinn/optim/ir_simplify.h
new file mode 100644
index 0000000000000..f5e2bdf82f6ba
--- /dev/null
+++ b/paddle/cinn/optim/ir_simplify.h
@@ -0,0 +1,37 @@
+// Copyright (c) 2021 CINN Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include "cinn/ir/ir.h"
+
+namespace cinn {
+namespace optim {
+
+/**
+ * Simplify the expression.
+ * The following cases are supported:
+ * a + 0 => a
+ * a * 0 => 0
+ * A[i*0+2*a+3*a+1+2] => A[5*a+3]
+ *
+ * This only works on simple IR nodes such as Load and Store, and on the math operators such as Add and Sub.
+ */
+void Simplify(Expr *expr);
+
+void SimplifyForLoops(Expr *expr);
+
+void SimplifyBlocks(Expr *expr);
+
+} // namespace optim
+} // namespace cinn
diff --git a/paddle/cinn/optim/ir_simplify_test.cc b/paddle/cinn/optim/ir_simplify_test.cc
new file mode 100755
index 0000000000000..b9e7fb807a072
--- /dev/null
+++ b/paddle/cinn/optim/ir_simplify_test.cc
@@ -0,0 +1,127 @@
+// Copyright (c) 2021 CINN Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "cinn/optim/ir_simplify.h"
+
+#include <gtest/gtest.h>
+
+#include "cinn/cinn.h"
+#include "cinn/ir/ir.h"
+
+namespace cinn {
+namespace optim {
+using utils::GetStreamCnt;
+using utils::Trim;
+
+TEST(IrSimplify, basic) {
+  auto A = Compute(
+      {Expr(100), Expr(20)}, [&](Var i, Var j) { return Expr(1.f); }, "C");
+  Buffer A_buf(A->type());
+  A->Bind(A_buf);
+
+  Var i("i"), j("j");
+  i->set_type(Int(32));
+  j->set_type(Int(32));
+
+  {  // simple case
+    auto B = A(i, Expr(0)) + 1.f * 0.f + 100.f + 24.5f;
+
+    LOG(INFO) << "B " << B;
+    // get (((C[(i * 20)] + 0) + 100) + 24.5)
+    Simplify(&B);
+    LOG(INFO) << "simplified: " << B;
+    auto out = "(124.500000f + C[i, 0])";
+    EXPECT_EQ(out, utils::GetStreamCnt(B));
+  }
+
+  {
+    Placeholder<float> x("X", {100, 20});
+    Placeholder<float> y("y", {100, 20});
+
+    auto B = Compute(
+        {Expr(100), Expr(20)},
+        [&](Expr i, Expr j) {
+          return x(i + 0, j + 0) + y(i, j * 0) * 1.f + 0.f * x(i, j) + 25.f + 100.f - 0.f +
+                 9.f * 10000.f * 1.f * 1.f * 0.f;
+        },
+        "B");
+
+    auto stages = CreateStages({B});
+    auto func = Lower("func", stages, {B});
+    auto body = func->body;
+
+    LOG(INFO) << "original body:\n" << body;
+    Simplify(&body);
+    auto target_out = R"ROC(
+{
+  serial for (i, 0, 100)
+  {
+    serial for (j, 0, 20)
+    {
+      B[i, j] = (125.000000f + (X[i, j] + y[i, 0]))
+    }
+  }
+}
+)ROC";
+    EXPECT_EQ(Trim(target_out), Trim(GetStreamCnt(body)));
+  }
+
+  {
+    Placeholder<float> x("X", {100, 20});
+    Placeholder<float> y("y", {100, 20});
+
+    auto B = Compute(
+        {Expr(100), Expr(20)},
+        [&](Expr i, Expr j) {
+          return x(100 * 10 * 1 * i + 0, j * 0) + y(i, j * 0) / (1.f + 2.f) + 0.f * x(i, j) + 25.f + 100.f - 0.f +
+                 9.f * 10000.f * 1.f * 1.f * 0.f;
+        },
+        "B");
+
+    auto stages = CreateStages({B});
+
+    auto func = Lower("func", stages, {B});
+    auto body = func->body;
+
+    LOG(INFO) << "original body:\n" << body;
+    Simplify(&body);
+
+    auto target_out = R"ROC(
+{
+  serial for (i, 0, 100)
+  {
+    serial for (j, 0, 20)
+    {
+      B[i, j] = ((y[i, 0] / 3.00000000f) + (125.000000f + X[(1000 * i), 0]))
+    }
+  }
+}
+)ROC";
+    EXPECT_EQ(Trim(target_out), Trim(GetStreamCnt(body)));
+  }
+}
+
+TEST(reverse, prod) {
+  Expr M(100), N(20);
+  Placeholder<float> A("A", {M, N});
+  auto C = Compute(
+      {M, N}, [=](Var i, Var j) { return Expr(1.f) / A(i, j); }, "C");
+
+  auto stages = CreateStages({A, C});
+  auto fn = Lower("fn", stages, {A, C});
+  LOG(INFO) << "fn:\n" << fn;
+}
+
+} // namespace optim
+} // namespace cinn
diff --git a/paddle/cinn/optim/lower_function_call_bind_vars.cc b/paddle/cinn/optim/lower_function_call_bind_vars.cc
new file mode 100644
index 0000000000000..abb80fc56b871
--- /dev/null
+++ b/paddle/cinn/optim/lower_function_call_bind_vars.cc
@@ -0,0 +1,73 @@
+// Copyright (c) 2021 CINN Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "cinn/optim/lower_function_call_bind_vars.h"
+
+#include <string>
+#include <vector>
+
+#include "cinn/ir/ir_mutator.h"
+
+namespace cinn {
+namespace optim {
+
+namespace {
+
+struct LowerFunctionCallBindVarsMutator : public ir::IRMutator<> {
+  LowerFunctionCallBindVarsMutator() = default;
+
+  void operator()(Expr* m) {
+    m_ = m->as_module();
+    Expr module(m->get());
+    ir::IRMutator<>::Visit(&module, &module);
+  }
+
+ private:
+  void Visit(const ir::Call* op, Expr* expr) {
+    auto* node = expr->As<ir::Call>();
+    if (op->is_cinn_call()) {
+      const std::string& target = op->name;
+      auto it = std::find_if(m_->functions.begin(), m_->functions.end(), [&](const Expr& x) {
+        return x.as_lowered_func()->name == target;
+      });
+      CHECK(it != m_->functions.end()) << "The called function [" << target << "] does not exist";
+
+      std::vector<ir::Var> extra_var_args;
+
+      for (auto& arg : (*it).as_lowered_func()->args) {
+        if (arg.is_var()) {
+          extra_var_args.push_back(arg.var_arg());
+        }
+      }
+
+      // Insert the extra var arguments at the beginning of the original call's argument list.
+      node->read_args.insert(std::begin(op->read_args), extra_var_args.begin(), extra_var_args.end());
+    }
+
+    ir::IRMutator<>::Visit(op, expr);
+  }
+
+ private:
+  ir::_Module_* m_{};
+};
+
+} // namespace
+
+void LowerFunctionCallBindVars(Expr* m) {
+  CHECK(m->as_module());
+  LowerFunctionCallBindVarsMutator()(m);
+}
+
+} // namespace optim
+} // namespace cinn
diff --git a/paddle/cinn/optim/lower_function_call_bind_vars.h b/paddle/cinn/optim/lower_function_call_bind_vars.h
new file mode 100644
index 0000000000000..d5b941862a9c7
--- /dev/null
+++ b/paddle/cinn/optim/lower_function_call_bind_vars.h
@@ -0,0 +1,26 @@
+// Copyright (c) 2021 CINN Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "cinn/ir/ir.h"
+#include "cinn/ir/module.h"
+
+namespace cinn {
+namespace optim {
+
+void LowerFunctionCallBindVars(Expr *m);
+
+} // namespace optim
+} // namespace cinn
diff --git a/paddle/cinn/optim/lower_intrin.cc b/paddle/cinn/optim/lower_intrin.cc
new file mode 100644
index 0000000000000..e342af8fbeb22
--- /dev/null
+++ b/paddle/cinn/optim/lower_intrin.cc
@@ -0,0 +1,95 @@
+// Copyright (c) 2021 CINN Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "cinn/optim/lower_intrin.h" + +#include + +#include "cinn/backends/llvm/llvm_intrin_rule.h" +#include "cinn/cinn.h" +#include "cinn/ir/intrinsic_ops.h" +#include "cinn/ir/ir_mutator.h" +#include "cinn/ir/registry.h" + +namespace cinn { +namespace optim { + +void LowerIntrin(Expr *e, Target target) { + if (target.arch == Target::Arch::X86) { + codegen::RegisterCpuIntrinRule(); + } else { + return; + } + struct Mutator : ir::IRMutator { + Target target; + + explicit Mutator(Target target) : target(target) {} + + void operator()(Expr *e) { ir::IRMutator<>::Visit(e, e); } + + void Visit(const ir::Add *op, Expr *expr) override { + auto *node = expr->As(); + CHECK(node); + Expr ret; + if (node->type().is_float()) { + if (const ir::Mul *mul = node->b().As()) { + ret = ir::Call::Make(node->type(), "fma", {mul->a(), mul->b(), node->a()}, {}, ir::CallType::Intrinsic); + } else if (const ir::Mul *mul = node->a().As()) { + ret = ir::Call::Make(node->type(), "fma", {mul->a(), mul->b(), node->b()}, {}, ir::CallType::Intrinsic); + } + if (ret.defined()) { + ir::IRMutator<>::Visit(&ret, &ret); + *expr = ret; + return; + } + } + ir::IRMutator<>::Visit(&node->a(), &node->a()); + ir::IRMutator<>::Visit(&node->b(), &node->b()); + } + + void Visit(const ir::Call *op, Expr *expr) override { + auto *node = expr->As(); + CHECK(node); + LowerCpuintrinsicOp(node, expr); + } + + void LowerCpuintrinsicOp(ir::Call *op, Expr *expr) { + auto *node = expr->As(); + if (kIntrinsicCalls.count(node->name)) { + CHECK(!node->name.empty()); + auto *func_ptr = ir::Registry::Get("lower_cpu_intrinsic_" + node->name); + CHECK(func_ptr) << "find no rule to lower cpu intrinsic for " + << "lower_cpu_intrinsic_" + node->name; + Expr ret = (*func_ptr)(Expr(node)); + if (!ret.same_as(*expr)) { + ir::IRMutator<>::Visit(&ret, &ret); + } + *expr = ret; + return; + } + for (auto &expr : node->read_args) { + ir::IRMutator<>::Visit(&expr, &expr); + } + for (auto &expr : node->write_args) { + ir::IRMutator<>::Visit(&expr, &expr); + } + } + }; + + Mutator m(target); + m(e); +} + +} // namespace optim +} // namespace cinn diff --git a/paddle/cinn/optim/lower_intrin.h b/paddle/cinn/optim/lower_intrin.h new file mode 100644 index 0000000000000..1b4b5cd2ac42d --- /dev/null +++ b/paddle/cinn/optim/lower_intrin.h @@ -0,0 +1,41 @@ +// Copyright (c) 2021 CINN Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
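+
+// A minimal sketch of the rewrite LowerIntrin performs for a float add on
+// X86; the names below are hypothetical and only illustrate the effect:
+//
+//   Expr e = a + b * c;  // node->b() matches ir::Mul, operands are float
+//   optim::LowerIntrin(&e, common::DefaultHostTarget());
+//   // e is now the intrinsic call fma(b, c, a), which is later lowered
+//   // through the registered "lower_cpu_intrinsic_fma" rule.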
+ +#pragma once + +#include +#include + +#include "cinn/ir/ir.h" + +namespace cinn { +namespace optim { + +static const std::set kIntrinsicCalls{ + {"exp", "exp2", "sqrt", "log", "log2", "log10", "floor", + "ceil", "round", "trunc", "cos", "cosh", "tan", "tanh", + "sin", "sinh", "fabs", "isnan", "isfinite", "isinf", "left_shift", + "right_shift", "bitwise_or", "bitwise_and", "bitwise_xor", "bitwise_not", "fma", "rsqrt"}}; + +/** + * Map the Call nodes to llvm intrinsic. + * + * This will rename the external call with the function in different backends. + * + * Notes: only support cpu currently. + */ +void LowerIntrin(Expr *e, Target target); + +} // namespace optim +} // namespace cinn diff --git a/paddle/cinn/optim/map_extern_call.cc b/paddle/cinn/optim/map_extern_call.cc new file mode 100644 index 0000000000000..f67129a64567b --- /dev/null +++ b/paddle/cinn/optim/map_extern_call.cc @@ -0,0 +1,119 @@ +// Copyright (c) 2021 CINN Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "cinn/optim/map_extern_call.h" + +#include "cinn/cinn.h" +#include "cinn/hlir/op/op_util.h" +#include "cinn/ir/ir_mutator.h" +#include "cinn/runtime/cpu/host_intrinsics.h" + +namespace cinn { +namespace optim { + +static const std::set kExternFp32CallsGPU{ + {"exp", "erf", "sigmoid", "sqrt", "log", "log2", "log10", "floor", "ceil", "round", "trunc", + "cos", "cosh", "tan", "sin", "sinh", "acos", "acosh", "asin", "asinh", "atan", "atanh", + "isnan", "tanh", "isfinite", "isinf", "remainder", "rsqrt", "cbrt", "abs", "pow", "mod"}}; + +static const std::set kExternInt32CallsGPU{{"left_shift", + "right_shift", + "bitwise_or", + "bitwise_and", + "bitwise_xor", + "bitwise_not", + "pow", + "logical_right_shift", + "clz", + "popc", + "mod"}}; + +static const std::set kExternFp32CallsCPU = { + "erf", "acos", "acosh", "asin", "asinh", "atan", "atanh", "remainder"}; + +void MapExternCall(Expr *e, Target target) { + struct Mutator : ir::IRMutator { + Target target; + + explicit Mutator(Target target) : target(target) {} + + void operator()(Expr *e) { ir::IRMutator<>::Visit(e, e); } + + void Visit(const ir::Call *op, Expr *expr) override { + auto *node = expr->As(); + CHECK(node); + OptimizeConstantPow(node); + if (target.arch == Target::Arch::NVGPU) { + DealWithNvGpuintrinsics(node, expr); + } else { + DealWithCpuintrinsics(node, expr); + } + } + + void DealWithCpuintrinsics(ir::Call *node, Expr *expr) { + if (kExternFp32CallsCPU.count(node->name)) { + CHECK_GE(node->read_args.size(), 1UL); + CHECK(node->read_args.front().type().is_float()) + << "CPU extern call instrinsices only support float now! 
Please check."; + if (node->read_args.front().type().is_float(32)) { + auto out_type = node->type(); + *expr = lang::CallExtern(node->name + "f", node->read_args); + } + } + } + + void DealWithNvGpuintrinsics(ir::Call *node, Expr *expr) { + auto arg_size = node->read_args.size(); + if (arg_size == 0UL) { + // some node like __syncthreads hasn't arguments + return; + } + const auto &dtype = node->read_args.front().type(); + const auto &name = node->name; + + bool node_in_extern_fp32 = kExternFp32CallsGPU.count(name); + bool node_in_extern_int32 = kExternInt32CallsGPU.count(name); + if (!node_in_extern_fp32 && !node_in_extern_int32) { + return; + } + + std::string extern_func = hlir::GetExternFuncName(common::DefaultNVGPUTarget(), dtype, name); + *expr = lang::CallExtern(extern_func, node->read_args, node->attrs); + } + + // Replace pow(x, 0.5) to sqrt(x) and pow(x, -0.5) to rsqrt(x), which + // can speed up a lot. + // + // Reference: + // https://en.wikipedia.org/wiki/Fast_inverse_square_root + void OptimizeConstantPow(ir::Call *node) { + if (node->name == "pow" && node->read_args.size() >= 2 && node->read_args[1].is_constant()) { + float pow_constant = node->read_args[1].get_constant(); + if (pow_constant == 0.5) { + node->name = "sqrt"; + node->read_args.erase(node->read_args.begin() + 1); + } else if (pow_constant == -0.5) { + node->name = "rsqrt"; + node->read_args.erase(node->read_args.begin() + 1); + } + } + } + }; + + Mutator m(target); + m(e); +} + +} // namespace optim +} // namespace cinn diff --git a/paddle/cinn/optim/map_extern_call.h b/paddle/cinn/optim/map_extern_call.h new file mode 100644 index 0000000000000..6ece28f96bad6 --- /dev/null +++ b/paddle/cinn/optim/map_extern_call.h @@ -0,0 +1,33 @@ +// Copyright (c) 2021 CINN Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include + +#include "cinn/ir/ir.h" + +namespace cinn { +namespace optim { + +/** + * Map the Call nodes to external function call. + * + * This will rename the external call with the function in different backends. + */ +void MapExternCall(Expr *e, Target target); + +} // namespace optim +} // namespace cinn diff --git a/paddle/cinn/optim/optimize.cc b/paddle/cinn/optim/optimize.cc new file mode 100644 index 0000000000000..55ddc705700de --- /dev/null +++ b/paddle/cinn/optim/optimize.cc @@ -0,0 +1,111 @@ +// Copyright (c) 2021 CINN Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "cinn/optim/optimize.h" + +#include "cinn/ir/ir_printer.h" +#include "cinn/ir/ir_schedule_util.h" +#include "cinn/optim/call_arg_list_to_pod_value.h" +#include "cinn/optim/cast_bool_to_int8.h" +#include "cinn/optim/cast_simplify.h" +#include "cinn/optim/eliminate_broadcast_in_forloop.h" +#include "cinn/optim/extern_call_process.h" +#include "cinn/optim/fold_cinn_call_arguments.h" +#include "cinn/optim/if_simplify.h" +#include "cinn/optim/insert_debug_log_callee.h" +#include "cinn/optim/ir_copy.h" +#include "cinn/optim/ir_simplify.h" +#include "cinn/optim/lower_function_call_bind_vars.h" +#include "cinn/optim/lower_intrin.h" +#include "cinn/optim/map_extern_call.h" +#include "cinn/optim/remove_nested_block.h" +#include "cinn/optim/remove_schedule_block.h" +#include "cinn/optim/replace_const_param_to_integer.h" +#include "cinn/optim/transform_gpu_forloop.h" +#include "cinn/optim/transform_polyfor_to_for.h" +#include "cinn/optim/unroll_loops.h" +#include "cinn/optim/vectorize_loops.h" + +DECLARE_bool(cinn_ir_schedule); + +namespace cinn { +namespace optim { + +Expr Optimize(Expr e, Target target, bool runtime_debug_info, bool remove_gpu_for_loops) { + CHECK(e.defined()); + auto copied = IRCopy(e); + + FoldCINNCallArguments(&copied); + TransformPolyForToFor(&copied); + ReplaceConstParamToInteger(&copied); + // Simplify already contains CastSimplify + Simplify(&copied); + UnrollLoop(&copied); + VLOG(4) << "After Optimize UnrollLoop:" << copied; + + VectorizeLoops(&copied, target); + VLOG(4) << "After Optimize VectorizeLoops:" << copied; +#ifdef CINN_WITH_CUDA + if (FLAGS_cinn_ir_schedule && copied.as_lowered_func()) { + ir::SetCudaAxisInfo(&copied); + } + if (remove_gpu_for_loops) { + RemoveGpuForloopsAxis(&copied); + } + CudaSyncThreadsDropIfThenElse(&copied); +#endif + + RemoveNestedBlock(&copied); + VLOG(4) << "After Optimize RemoveNestedBlock:" << copied; + + MapExternCall(&copied, target); + VLOG(10) << "After Optimize MapExternCall:" << copied; + + ExternCallMultiOutputShallowStore(&copied); + VLOG(10) << "After Optimize ExternCallMultiOutputShallowStore:" << copied; + // Simplify already contains CastSimplify + Simplify(&copied); + VLOG(10) << "After Optimize Simplify:" << copied; + + IfSimplify(&copied); + VLOG(10) << "After Optimize IfSimplify:" << copied; + + if (runtime_debug_info) { + LOG(WARNING) << "Turn on runtime debug information output"; + InsertDebugLogCallee(&copied); + } + return copied; +} + +ir::Module Optimize(const ir::Module& module, const Target& target) { + auto copied = IRCopy(Expr(module)); + if (FLAGS_cinn_ir_schedule) { + UnrollLoop(&copied); + VectorizeLoops(&copied, Target()); + } + VLOG(10) << "After VectorizeLoops:" << copied.as_module_ref(); + RemoveScheduleBlock(&copied); + VLOG(10) << "After RemoveScheduleBlock:" << copied.as_module_ref(); + LowerFunctionCallBindVars(&copied); + VLOG(10) << "After LowerFunctionCallBindVars:" << copied.as_module_ref(); + CallArgListToPodValue(&copied); + VLOG(10) << "After CallArgListToPodValue:" << copied.as_module_ref(); + LowerIntrin(&copied, target); + VLOG(10) << "After LowerIntrin:" << copied.as_module_ref(); + + return copied.as_module_ref(); +} + +} // namespace optim +} // namespace cinn diff --git a/paddle/cinn/optim/optimize.h b/paddle/cinn/optim/optimize.h new file mode 100644 index 0000000000000..7d1165f3d883c --- /dev/null +++ b/paddle/cinn/optim/optimize.h @@ -0,0 +1,36 @@ +// Copyright (c) 2021 CINN Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include "cinn/ir/ir.h" +#include "cinn/ir/module.h" + +namespace cinn { +namespace optim { + +/** + * Optimize the expression but Module. + * @param e + * @param runtime_debug_info + * @return + */ +Expr Optimize(Expr e, Target target, bool runtime_debug_info = false, bool remove_gpu_for_loops = true); + +/** + * Optimize a Module. + */ +ir::Module Optimize(const ir::Module& module, const Target& target); + +} // namespace optim +} // namespace cinn diff --git a/paddle/cinn/optim/optimize_test.cc b/paddle/cinn/optim/optimize_test.cc new file mode 100755 index 0000000000000..1479fa6b37871 --- /dev/null +++ b/paddle/cinn/optim/optimize_test.cc @@ -0,0 +1,58 @@ +// Copyright (c) 2021 CINN Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "cinn/optim/optimize.h" + +#include + +#include "cinn/cinn.h" +#include "cinn/ir/ir_printer.h" +#include "cinn/utils/string.h" + +namespace cinn { +namespace optim { + +TEST(Optimize, Unroll) { + Placeholder A("A", {100, 20}); + + auto C = Compute( + {Expr(100), Expr(20)}, [&](Var i, Var j) { return A(i, j) + 1.f; }, "C"); + auto stages = CreateStages({C}); + + stages[C]->Split(1, 5); + stages[C]->Unroll(2); + + auto func = Lower("matmul", stages, {A, C}); + + auto out = R"ROC( +{ + serial for (i, 0, 100) + { + serial for (j_outer, 0, 4) + { + C[i, (5 * j_outer)] = (1.00000000f + A[i, (5 * j_outer)]) + C[i, (1 + (5 * j_outer))] = (1.00000000f + A[i, (1 + (5 * j_outer))]) + C[i, (2 + (5 * j_outer))] = (1.00000000f + A[i, (2 + (5 * j_outer))]) + C[i, (3 + (5 * j_outer))] = (1.00000000f + A[i, (3 + (5 * j_outer))]) + C[i, (4 + (5 * j_outer))] = (1.00000000f + A[i, (4 + (5 * j_outer))]) + } + } +} +)ROC"; + + EXPECT_EQ(utils::Trim(out), utils::Trim(utils::GetStreamCnt(func->body))); +} + +} // namespace optim +} // namespace cinn diff --git a/paddle/cinn/optim/remove_nested_block.cc b/paddle/cinn/optim/remove_nested_block.cc new file mode 100644 index 0000000000000..366dd23a1a33f --- /dev/null +++ b/paddle/cinn/optim/remove_nested_block.cc @@ -0,0 +1,121 @@ +// Copyright (c) 2021 CINN Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "cinn/optim/remove_nested_block.h"
+
+#include "cinn/ir/ir_mutator.h"
+#include "cinn/ir/ir_printer.h"
+
+namespace cinn {
+namespace optim {
+
+Expr GetExprInsideBlock(Expr op) {
+  Expr node = op;
+  while (node.As<ir::Block>()) {
+    auto& stmts = node.As<ir::Block>()->stmts;
+    if (stmts.size() == 1) {
+      node = stmts.front();
+    } else {
+      break;
+    }
+  }
+  return node;
+}
+
+// This will remove the nested blocks, but it will also remove the block outside the forloop's body.
+struct NestedBlockSimplifer : public ir::IRMutator<ir::Expr *> {
+  void operator()(ir::Expr* expr) { Visit(expr); }
+
+ private:
+  void Visit(ir::Expr* expr) { ir::IRMutator<>::Visit(expr, expr); }
+
+  void Visit(const ir::Block* expr, Expr* op) override {
+    auto* node = op->As<ir::Block>();
+    if (node->stmts.size() == 1) {
+      *op = GetExprInsideBlock(*op);
+      IRMutator::Visit(op, op);
+    } else {
+      IRMutator::Visit(expr, op);
+    }
+  }
+};
+
+struct NestedBlockRemover : public ir::IRMutator<ir::Expr *> {
+  void operator()(ir::Expr* expr) { Visit(expr); }
+
+ private:
+  void Visit(ir::Expr* expr) { ir::IRMutator<>::Visit(expr, expr); }
+
+  void Visit(const ir::Block* expr, Expr* op) override {
+    auto* node = op->As<ir::Block>();
+
+    std::vector<ir::Expr> new_exprs;
+
+    bool detect_nested = false;
+    for (auto it = node->stmts.begin(); it != node->stmts.end(); it++) {
+      auto* block = it->As<ir::Block>();
+      if (block) {
+        detect_nested = true;
+        new_exprs.insert(std::end(new_exprs), block->stmts.begin(), block->stmts.end());
+      } else {
+        new_exprs.push_back(*it);
+      }
+    }
+
+    node->stmts = new_exprs;
+
+    IRMutator::Visit(expr, op);
+  }
+};
+
+// add block outside forloop's body.
+struct AddBlockToForloop : public ir::IRMutator<> {
+  void operator()(ir::Expr* expr) { ir::IRMutator<>::Visit(expr, expr); }
+
+  void Visit(const ir::For* expr, Expr* op) override {
+    auto* node = op->As<ir::For>();
+    if (!node->body.As<ir::Block>()) {
+      node->body = ir::Block::Make({node->body});
+    }
+
+    ir::IRMutator<>::Visit(expr, op);
+  }
+
+  void Visit(const ir::PolyFor* expr, Expr* op) override {
+    auto* node = op->As<ir::PolyFor>();
+    if (!node->body.As<ir::Block>()) {
+      node->body = ir::Block::Make({node->body});
+    }
+
+    ir::IRMutator<>::Visit(expr, op);
+  }
+
+  void Visit(const ir::_LoweredFunc_* expr, Expr* op) override {
+    auto* node = op->As<ir::_LoweredFunc_>();
+    if (!node->body.As<ir::Block>()) {
+      node->body = ir::Block::Make({node->body});
+    }
+
+    ir::IRMutator<>::Visit(expr, op);
+  }
+};
+
+void RemoveNestedBlock(Expr* e) {
+  NestedBlockRemover()(e);
+  NestedBlockSimplifer()(e);
+  AddBlockToForloop()(e);
+}
+
+} // namespace optim
+} // namespace cinn
diff --git a/paddle/cinn/optim/remove_nested_block.h b/paddle/cinn/optim/remove_nested_block.h
new file mode 100644
index 0000000000000..cf6393fc863a1
--- /dev/null
+++ b/paddle/cinn/optim/remove_nested_block.h
@@ -0,0 +1,33 @@
+// Copyright (c) 2021 CINN Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +/** + * This file implements the strategy to remove the unnecessary nested block. + */ +#pragma once +#include + +#include "cinn/common/common.h" +#include "cinn/ir/ir.h" + +namespace cinn { +namespace optim { + +/** + * Remove the unecessary nested block. + */ +void RemoveNestedBlock(Expr* e); + +} // namespace optim +} // namespace cinn diff --git a/paddle/cinn/optim/remove_nested_block_test.cc b/paddle/cinn/optim/remove_nested_block_test.cc new file mode 100644 index 0000000000000..a62689c7d1ea0 --- /dev/null +++ b/paddle/cinn/optim/remove_nested_block_test.cc @@ -0,0 +1,58 @@ +// Copyright (c) 2021 CINN Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "cinn/optim/remove_nested_block.h" + +#include + +#include +#include + +#include "cinn/ir/ir_printer.h" +#include "cinn/utils/string.h" + +namespace cinn { +namespace optim { + +TEST(RemoveNestedBlock, basic) { + auto block0 = ir::Block::Make({Expr(1.f), Expr(1.f)}); + auto block1 = ir::Block::Make({block0}); + auto e = Expr(block1); + + std::string origin = utils::GetStreamCnt(e); + EXPECT_EQ(origin, utils::Trim(R"ROC( +{ + { + 1.00000000f + 1.00000000f + } +} + )ROC")); + + std::cout << "origin:\n" << e << std::endl; + + RemoveNestedBlock(&e); + + std::cout << "e:\n" << e << std::endl; + + EXPECT_EQ(utils::GetStreamCnt(e), utils::Trim(R"ROC( +{ + 1.00000000f + 1.00000000f +} + )ROC")); +} + +} // namespace optim +} // namespace cinn diff --git a/paddle/cinn/optim/remove_schedule_block.cc b/paddle/cinn/optim/remove_schedule_block.cc new file mode 100644 index 0000000000000..e496ccdca4f0f --- /dev/null +++ b/paddle/cinn/optim/remove_schedule_block.cc @@ -0,0 +1,50 @@ +// Copyright (c) 2021 CINN Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
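+
+// Schematic effect of RemoveScheduleBlock (pseudo-IR, for illustration):
+//
+//   ScheduleBlock(C) { i0, i1 = axis.bind(i, j); C[i0, i1] = A[i0, i1] }
+//   ==>
+//   C[i, j] = A[i, j]
+//
+// Each iter_var is substituted with its bound iter_value, and the realize
+// node collapses into its body.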
+
+#include "cinn/optim/remove_schedule_block.h"
+
+#include "cinn/ir/ir_mutator.h"
+#include "cinn/ir/ir_printer.h"
+#include "cinn/optim/replace_var_with_expr.h"
+
+namespace cinn {
+namespace optim {
+
+struct ScheduleBlockRemover : public ir::IRMutator<ir::Expr *> {
+  void operator()(ir::Expr* expr) { Visit(expr); }
+
+ private:
+  void Visit(ir::Expr* expr) { ir::IRMutator<>::Visit(expr, expr); }
+
+  void Visit(const ir::ScheduleBlockRealize* op, Expr* expr) override {
+    auto* node = expr->As<ir::ScheduleBlockRealize>();
+    CHECK(node);
+    auto& iter_values = node->iter_values;
+    auto* schedule_block = node->schedule_block.As<ir::ScheduleBlock>();
+    CHECK(schedule_block);
+    auto& iter_vars = schedule_block->iter_vars;
+    Expr body = schedule_block->body;
+    CHECK_EQ(iter_vars.size(), iter_values.size());
+    for (int i = 0; i < iter_vars.size(); i++) {
+      optim::ReplaceVarWithExpr(&body, iter_vars[i], iter_values[i]);
+    }
+    *expr = body;
+    IRMutator::Visit(expr, expr);
+  }
+};
+
+void RemoveScheduleBlock(Expr* e) { ScheduleBlockRemover()(e); }
+
+} // namespace optim
+} // namespace cinn
diff --git a/paddle/cinn/optim/remove_schedule_block.h b/paddle/cinn/optim/remove_schedule_block.h
new file mode 100644
index 0000000000000..791c12159f81f
--- /dev/null
+++ b/paddle/cinn/optim/remove_schedule_block.h
@@ -0,0 +1,33 @@
+// Copyright (c) 2021 CINN Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+/**
+ * This file implements the strategy to remove the schedule blocks after scheduling.
+ */
+#pragma once
+#include <string>
+
+#include "cinn/common/common.h"
+#include "cinn/ir/ir.h"
+
+namespace cinn {
+namespace optim {
+
+/**
+ * Remove the schedule blocks, substituting each bound iter_var with its iter_value.
+ */
+void RemoveScheduleBlock(Expr* e);
+
+} // namespace optim
+} // namespace cinn
diff --git a/paddle/cinn/optim/remove_schedule_block_test.cc b/paddle/cinn/optim/remove_schedule_block_test.cc
new file mode 100755
index 0000000000000..bf41b729ea900
--- /dev/null
+++ b/paddle/cinn/optim/remove_schedule_block_test.cc
@@ -0,0 +1,98 @@
+// Copyright (c) 2021 CINN Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
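+
+// The test below lowers C[i, j] = sum(A[i, k0] * B[k0, j]) with schedule
+// blocks enabled, then checks that RemoveScheduleBlock substitutes the
+// bound axes (i0 -> i, i1 -> j, i2 -> k0) and drops the block wrappers.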
+ +#include "cinn/optim/remove_schedule_block.h" + +#include + +#include +#include + +#include "cinn/cinn.h" +#include "cinn/ir/ir.h" +#include "cinn/ir/ir_operators.h" +#include "cinn/ir/ir_printer.h" +#include "cinn/utils/string.h" + +namespace cinn { +namespace optim { + +TEST(RemovescheduleBlock, basic) { + using namespace ir; // NOLINT + Context::Global().ResetNameId(); + Placeholder A("A", {Expr(100), Expr(20)}); + Placeholder B("B", {Expr(20), Expr(50)}); + Target target = common::DefaultHostTarget(); + Module::Builder builder("matmul", target); + // C = A * B + Var k(20, "k0"); + Tensor C = Compute( + {Expr(100), Expr(50)}, [&](Var i, Var j) { return lang::ReduceSum(A(i, k) * B(k, j), {k}); }, "C"); + auto stages = CreateStages({A, B, C}); + auto func = Lower("matmul", stages, {A, B, C}, {}, {}, nullptr, target, true); + LOG(INFO) << "func\n" << func; + + std::string origin = utils::GetStreamCnt(func); + EXPECT_EQ(origin, utils::Trim(R"ROC( +function matmul (_A, _B, _C) +{ + ScheduleBlock(root) + { + serial for (i, 0, 100) + { + serial for (j, 0, 50) + { + ScheduleBlock(C__reduce_init) + { + i0, i1 = axis.bind(i, j) + C__reduce_init[i0, i1] = 0.00000000f + } + serial for (k0, 0, 20) + { + ScheduleBlock(C) + { + i0_0, i1_0, i2 = axis.bind(i, j, k0) + C[i0_0, i1_0] = (C[i0_0, i1_0] + (A[i0_0, i2] * B[i2, i1_0])) + } + } + } + } + } +} +)ROC")); + + RemoveScheduleBlock(&func->body); + + std::cout << "after RemovescheduleBlock:\n" << func << std::endl; + + EXPECT_EQ(utils::GetStreamCnt(func), utils::Trim(R"ROC( +function matmul (_A, _B, _C) +{ + serial for (i, 0, 100) + { + serial for (j, 0, 50) + { + C__reduce_init[i, j] = 0.00000000f + serial for (k0, 0, 20) + { + C[i, j] = (C[i, j] + (A[i, k0] * B[k0, j])) + } + } + } +} +)ROC")); +} + +} // namespace optim +} // namespace cinn diff --git a/paddle/cinn/optim/replace_call_with_expr.cc b/paddle/cinn/optim/replace_call_with_expr.cc new file mode 100644 index 0000000000000..ac69e484cec31 --- /dev/null +++ b/paddle/cinn/optim/replace_call_with_expr.cc @@ -0,0 +1,125 @@ +// Copyright (c) 2021 CINN Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
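+
+// Usage sketch (hypothetical statement name and candidate expression):
+//
+//   Expr e = ...;  // contains Call nodes named "S0"
+//   optim::ReplaceCallWithExpr(&e, "S0", C(i, j) + 1.f);
+//   // every Call named "S0" in e is now inlined as C(i, j) + 1.f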
+
+#include "cinn/optim/replace_call_with_expr.h"
+
+#include "cinn/ir/ir_mutator.h"
+#include "cinn/ir/ir_printer.h"
+#include "cinn/optim/ir_copy.h"
+#include "cinn/optim/replace_var_with_expr.h"
+
+namespace cinn {
+namespace optim {
+
+struct ReplaceCallWithExprModifier : public ir::IRMutator<> {
+  ReplaceCallWithExprModifier(const std::string &statement, const Expr &candidate)
+      : statement_(statement), candidate_(candidate) {}
+
+  void operator()(Expr *e) { IRMutator<>::Visit(e, e); }
+
+ private:
+  void Visit(const ir::Call *expr, Expr *op) override {
+    auto *node = op->As<ir::Call>();
+    CHECK(!node->name.empty()) << "Call has no name";
+    VLOG(3) << "Processing Call node " << *op;
+    if (statement_ != node->name) return;
+
+    Expr expr_candidate = IRCopy(candidate_);
+    VLOG(3) << "Original candidate expr: " << candidate_;
+    VLOG(3) << "Copied candidate expr: " << expr_candidate;
+
+    // Replace the Call node with the expression candidate.
+    *op = expr_candidate;
+    VLOG(3) << "expr " << *op;
+  }
+
+ private:
+  std::string statement_;
+  const Expr &candidate_;
+};
+
+void ReplaceCallWithExpr(Expr *e, const std::string &statement, const Expr &candidate) {
+  ReplaceCallWithExprModifier modifier(statement, candidate);
+  modifier(e);
+}
+
+void ReplaceIslCallWithExpr(Expr *e,
+                            const std::string &statement,
+                            const Expr &candidate,
+                            const std::map<std::string, Expr> &axis_map) {
+  VLOG(3) << "ReplaceCallWithExpr, original expression: " << candidate;
+  Expr copied = IRCopy(candidate);
+  // Update the axis in the copied expression.
+
+  // We treat the Store node as a normal statement; the others, such as Call nodes, have no axis.
+  std::map<std::string, Expr> local_axis;
+  std::vector<std::string> origin_axes;
+  std::map<std::string, Expr> new_axis_map = axis_map;
+  for (auto &item : axis_map) {
+    origin_axes.push_back(item.first);
+  }
+  // Append '_after' to a transformed var's name to avoid transforming it twice.
+  // For example, given indices [i,j], if we want to swap 'i' and 'j' (i->j, j->i),
+  // without the '_after' suffix the processing would be:
+  //   1. [i,j] to [j,j]
+  //   2. [j,j] to [i,i]
+  // and we would get [i,i], which differs from the correct result [j,i].
+  // With the '_after' suffix the processing is:
+  //   1. [i,j] to [j_after,j]
+  //   2. [j_after,j] to [j_after,i_after]
+  //   3. [j_after,i_after] to [j,i]
+  // Mission complete!
+  for (auto &item : new_axis_map) {
+    for (auto &axis : origin_axes) {
+      ReplaceVarWithExpr(&item.second, Var(axis), Expr(Var(axis + "_after")));
+    }
+  }
+  if (copied.As<ir::Store>()) {
+    auto *store = copied.As<ir::Store>();
+    for (int i = 0; i < store->indices.size(); i++) {
+      auto indice = store->indices[i];
+      if (indice.is_var() || indice.is_constant()) {
+        if (!new_axis_map.count(std::to_string(i))) continue;
+        if (!indice.is_constant()) {
+          local_axis[indice.as_var()->name] = new_axis_map.at(std::to_string(i));
+        }
+      }
+    }
+    // The store indices only contain the axes of the transform's domain, not the range.
+    // e.g. for { s[i,j] -> s[i0,i1,j]: i0=i/4 and i1=i%4 }, the store's indices only contain i,j, while in the
+    // final code the axes come from the range; that is, some new axes such as i0 and i1 do not exist in
+    // store->indices.
+ } + + for (auto &laxis : local_axis) { + VLOG(3) << "local_axis Replacing axis: " << laxis.first << " to " << laxis.second; + ReplaceVarWithExpr(&copied, Var(laxis.first), laxis.second); + } + // replace the remaining axis(in the transform's range) + for (auto &item : new_axis_map) { + if (!local_axis.count(item.first)) { + VLOG(3) << "new_axis_map Replacing axis: " << item.first << " to " << item.second; + ReplaceVarWithExpr(&copied, Var(item.first), item.second); + } + } + + for (auto &axis : origin_axes) { + ReplaceVarWithExpr(&copied, Var(axis + "_after"), Expr(Var(axis))); + } + + VLOG(3) << "After replacing, the statement [" << statement << "] is : " << copied; + ReplaceCallWithExpr(e, statement, copied); +} + +} // namespace optim +} // namespace cinn diff --git a/paddle/cinn/optim/replace_call_with_expr.h b/paddle/cinn/optim/replace_call_with_expr.h new file mode 100644 index 0000000000000..470a4835038e8 --- /dev/null +++ b/paddle/cinn/optim/replace_call_with_expr.h @@ -0,0 +1,45 @@ +// Copyright (c) 2021 CINN Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include + +#include "cinn/ir/ir.h" + +namespace cinn { +namespace optim { + +/** + * Replace a Call node with a Expr (inline). + * @param e The expression to modify. + * @param statement The map from tuple_name to the expression candidate. + * @param candidate Var of each axis in the expression candidate. + */ +void ReplaceCallWithExpr(Expr *e, const std::string &statement, const Expr &candidate); + +/** + * Replace a Call node with a Expr (inline). + * @param e The expression to modify. + * @param statement The map from tuple_name to the expression candidate. + * @param candidate Var of each axis in the expression candidate. + * @param axis_map The map from a variable to expression. + */ +void ReplaceIslCallWithExpr(Expr *e, + const std::string &statement, + const Expr &candidate, + const std::map &axis_map); + +} // namespace optim +} // namespace cinn diff --git a/paddle/cinn/optim/replace_call_with_expr_test.cc b/paddle/cinn/optim/replace_call_with_expr_test.cc new file mode 100644 index 0000000000000..f5d08027a89d4 --- /dev/null +++ b/paddle/cinn/optim/replace_call_with_expr_test.cc @@ -0,0 +1,31 @@ +// Copyright (c) 2021 CINN Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
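+
+// Illustrative trace of the '_after' renaming used by ReplaceIslCallWithExpr,
+// assuming an axis_map that swaps i and j:
+//
+//   [i, j] -> [j_after, j] -> [j_after, i_after] -> [j, i]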
+ +#include "cinn/optim/replace_call_with_expr.h" + +#include + +#include "cinn/ir/buffer.h" +#include "cinn/ir/ir_operators.h" +#include "cinn/ir/ir_printer.h" +#include "cinn/lang/placeholder.h" +#include "cinn/poly/ast_gen.h" + +namespace cinn { +namespace optim { + +using namespace poly; + +} // namespace optim +} // namespace cinn diff --git a/paddle/cinn/optim/replace_const_param_to_integer.cc b/paddle/cinn/optim/replace_const_param_to_integer.cc new file mode 100644 index 0000000000000..9d270e4e8d9b6 --- /dev/null +++ b/paddle/cinn/optim/replace_const_param_to_integer.cc @@ -0,0 +1,43 @@ +// Copyright (c) 2021 CINN Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "cinn/optim/replace_const_param_to_integer.h" + +#include "cinn/ir/ir_mutator.h" +#include "cinn/poly/ast_gen.h" +#include "cinn/utils/string.h" + +namespace cinn::optim { + +namespace { + +struct Mutator : public ir::IRMutator<> { + using ir::IRMutator<>::Visit; + + void Visit(const ir::_Var_* op, Expr* expr) override { + if (utils::Startswith(op->name, poly::kIslParamConstPrefix)) { + std::string value = op->name.substr(strlen(poly::kIslParamConstPrefix)); + *expr = Expr(std::stoi(value)); + } + } +}; + +} // namespace + +void ReplaceConstParamToInteger(Expr* e) { + Mutator mutator; + mutator.Visit(e, e); +} + +} // namespace cinn::optim diff --git a/paddle/cinn/optim/replace_const_param_to_integer.h b/paddle/cinn/optim/replace_const_param_to_integer.h new file mode 100644 index 0000000000000..40b7dee5b3299 --- /dev/null +++ b/paddle/cinn/optim/replace_const_param_to_integer.h @@ -0,0 +1,34 @@ +// Copyright (c) 2021 CINN Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include "cinn/ir/ir.h" + +namespace cinn::optim { + +/** + * Replace the constant parameter(included in ISL param) to the corresponding integer. + * + * e.g. + * + * The expression: + * for (int i = 0; i <= _const_0; i++) ... + * + * to + * + * for (int i = 0; i < 0; i++) + */ +void ReplaceConstParamToInteger(Expr* e); + +} // namespace cinn::optim diff --git a/paddle/cinn/optim/replace_var_with_expr.cc b/paddle/cinn/optim/replace_var_with_expr.cc new file mode 100644 index 0000000000000..c10a16bb60339 --- /dev/null +++ b/paddle/cinn/optim/replace_var_with_expr.cc @@ -0,0 +1,159 @@ +// Copyright (c) 2021 CINN Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "cinn/optim/replace_var_with_expr.h" + +#include "cinn/common/cas.h" +#include "cinn/ir/ir.h" +#include "cinn/ir/ir_mutator.h" +#include "cinn/ir/ir_operators.h" +#include "cinn/ir/ir_printer.h" +#include "cinn/ir/tensor.h" +#include "cinn/optim/ir_copy.h" +#include "cinn/optim/ir_simplify.h" +#include "cinn/optim/replace_const_param_to_integer.h" + +namespace cinn { +namespace optim { + +struct ReplaceVarWithExprMutator : public ir::IRMutator<> { + ReplaceVarWithExprMutator(const Var& var, const Expr& expr, const std::string& tensor_name) + : var_(var), expr_(expr), tensor_name_(tensor_name) {} + + void operator()(Expr* expr) { + if (tensor_name_.empty()) visit_all_ = true; + IRMutator::Visit(expr, expr); + } + + private: + void Visit(const ir::_Var_* expr, Expr* op) override { + if (expr->name == var_->name && (do_replace_ || visit_all_)) { + auto copied = IRCopy(expr_); + *op = copied; + } + } + + void Visit(const ir::For* op, Expr* expr) override { + auto* node = expr->As(); + ir::IRMutator<>::Visit(&node->min, &node->min); + ir::IRMutator<>::Visit(&node->extent, &node->extent); + ir::IRMutator<>::Visit(&node->body, &node->body); + if (node->loop_var->name == var_->name && expr_.As() && visit_all_) { + node->loop_var = expr_.As(); + } + } + + void Visit(const ir::PolyFor* op, Expr* expr) override { + auto* node = expr->As(); + ir::IRMutator<>::Visit(&node->init, &node->init); + ir::IRMutator<>::Visit(&node->condition, &node->condition); + ir::IRMutator<>::Visit(&node->inc, &node->inc); + ir::IRMutator<>::Visit(&node->body, &node->body); + if (node->iterator->name == var_->name && expr_.As() && visit_all_) { + node->iterator = expr_.As(); + } + } + + void Visit(const ir::Store* op, Expr* expr) override { + auto* node = expr->As(); + auto* tensor = node->tensor.as_tensor(); + + if (tensor->name == tensor_name_) { + do_replace_ = true; + } else { + do_replace_ = false; + } + for (auto& index : node->indices) { + ir::IRMutator<>::Visit(&index, &index); + } + do_replace_ = false; + ir::IRMutator<>::Visit(&node->tensor, &node->tensor); + ir::IRMutator<>::Visit(&node->value, &node->value); + } + + void Visit(const ir::Load* expr, Expr* op) override { + auto* node = op->As(); + auto* tensor = node->tensor.as_tensor(); + if (tensor->name == tensor_name_) { + do_replace_ = true; + } else { + do_replace_ = false; + } + for (auto& idx : node->indices) ir::IRMutator<>::Visit(&idx, &idx); + do_replace_ = false; + ir::IRMutator<>::Visit(&node->tensor, &node->tensor); + } + + private: + bool do_replace_{false}; + bool visit_all_{false}; + const Var& var_; + const Expr& expr_; + const std::string& tensor_name_; +}; + +void ReplaceVarWithExpr(Expr* source, const Var& var, const Expr& expr, const std::string& tensor_name) { + ReplaceVarWithExprMutator mutator(var, expr, tensor_name); + mutator(source); +} + +struct CollectTensorIndexMutator : public ir::IRMutator<> { + CollectTensorIndexMutator(const std::string& tensor_name) : 
tensor_name_(tensor_name) {} + + std::vector> operator()(Expr* expr) { + IRMutator::Visit(expr, expr); + return res; + } + + private: + void Visit(const ir::For* op, Expr* expr) override { + auto* node = expr->As(); + ir::IRMutator<>::Visit(&node->body, &node->body); + } + + void Visit(const ir::PolyFor* op, Expr* expr) override { + auto* node = expr->As(); + ir::IRMutator<>::Visit(&node->body, &node->body); + } + + void Visit(const ir::Load* expr, Expr* op) override { + auto* node = op->As(); + auto* tensor = node->tensor.as_tensor(); + if (tensor->name == tensor_name_) { + ir::IRMutator<>::Visit(&node->tensor, &node->tensor); + res.push_back(node->indices); + } else { + ir::IRMutator<>::Visit(&node->tensor, &node->tensor); + for (auto& idx : node->indices) ir::IRMutator<>::Visit(&idx, &idx); + } + } + + private: + std::vector> res; + const std::string& tensor_name_; +}; + +std::vector> CollectTensorIndex(Expr* source, const std::string& tensor_name) { + CollectTensorIndexMutator mutator(tensor_name); + std::vector> result = mutator(source); + for (auto& i : result) { + for (auto& j : i) { + j = common::AutoSimplify(j); + } + } + return result; +} + +} // namespace optim +} // namespace cinn diff --git a/paddle/cinn/optim/replace_var_with_expr.h b/paddle/cinn/optim/replace_var_with_expr.h new file mode 100644 index 0000000000000..50b2b2dd3ce31 --- /dev/null +++ b/paddle/cinn/optim/replace_var_with_expr.h @@ -0,0 +1,77 @@ +// Copyright (c) 2021 CINN Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include +#include +#include + +#include "cinn/ir/ir.h" + +namespace cinn { +namespace optim { + +/** + * Replace the variable with a expression. + * @param var The variable to replace. + * @param expr The candidate expression. + * @param tensor_name Name of the tensor whose indices will be edited. If it is empty, means we will + * do the replace in all Expr instead of only in specific tensor's indices. + */ +/** + * Example 1: ReplaceVarWithExpr(source, Var("i"), Expr(0), "A") + * for(i, 0, 10) + * for(j, 0, 10) + * B[i,j] = A[i,j] + * + * => + * + * for(i, 0, 10) + * for(j, 0, 10) + * B[i,j] = A[0,j] + * + * Example 2: ReplaceVarWithExpr(source, Var("i"), Expr(Var("k"))) + * for(i, 0, 10) + * for(j, 0, 10) + * B[i,j] = A[i,j] + * + * => + * + * for(k, 0, 10) + * for(j, 0, 10) + * B[k,j] = A[k,j] + */ +void ReplaceVarWithExpr(Expr *source, const Var &var, const Expr &expr, const std::string &tensor_name = ""); + +/** + * Collect the specific tensor's indices. + * @param tensor_name The specific tensor's name. + * @return Return a vector containing all the indices of the specific tensor appeared in source. 
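+ * Each collected index is simplified with common::AutoSimplify before it is returned.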
+ */
+/**
+ * Example: CollectTensorIndex(source, "A")
+ * for(i, 0, 10)
+ *   for(j, 0, 10)
+ *     C[i,j] = A[i,j] + A[0,j] + B[j,i] + B[i,0]
+ *
+ * =>
+ *
+ * Return value:
+ * {{i,j},{0,j}}
+ */
+std::vector<std::vector<Expr>> CollectTensorIndex(Expr *source, const std::string &tensor_name);
+
+} // namespace optim
+} // namespace cinn
diff --git a/paddle/cinn/optim/tensor_write_tell.cc b/paddle/cinn/optim/tensor_write_tell.cc
new file mode 100644
index 0000000000000..d52590cf17d29
--- /dev/null
+++ b/paddle/cinn/optim/tensor_write_tell.cc
@@ -0,0 +1,19 @@
+// Copyright (c) 2021 CINN Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "cinn/optim/tensor_write_tell.h"
+
+namespace cinn {
+namespace optim {} // namespace optim
+} // namespace cinn
diff --git a/paddle/cinn/optim/tensor_write_tell.h b/paddle/cinn/optim/tensor_write_tell.h
new file mode 100644
index 0000000000000..a44664ca25baf
--- /dev/null
+++ b/paddle/cinn/optim/tensor_write_tell.h
@@ -0,0 +1,54 @@
+// Copyright (c) 2021 CINN Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include <set>
+#include <string>
+
+#include "cinn/ir/ir.h"
+#include "cinn/ir/ir_mutator.h"
+
+namespace cinn {
+namespace optim {
+
+struct TensorWriteTeller : public ir::IRMutator<const Expr *> {
+  //! Collect the write info in \p op.
+  void Collect(const Expr* op) { Visit(op, op); }
+
+  bool IsWrite(const std::string& tensor_name) const { return tensor_written.count(tensor_name); }
+
+ private:
+  std::set<std::string> tensor_written;
+
+  void Visit(const Expr* expr, const Expr* op) override { IRMutator::Visit(expr, op); }
+
+  void Visit(const ir::Store* expr, const Expr* op) override {
+    auto* node = op->As<ir::Store>();
+    CHECK(node);
+    auto* tensor = node->tensor.As<ir::_Tensor_>();
+    CHECK(tensor);
+    tensor_written.insert(tensor->name);
+    IRMutator::Visit(expr, op);
+  }
+
+  void Visit(const ir::_Tensor_* op, const Expr* expr) override {
+    auto* node = expr->As<ir::_Tensor_>();
+    if (node->is_call_node()) {
+      tensor_written.insert(node->name);
+    }
+  }
+};
+
+} // namespace optim
+} // namespace cinn
diff --git a/paddle/cinn/optim/transform_gpu_forloop.cc b/paddle/cinn/optim/transform_gpu_forloop.cc
new file mode 100644
index 0000000000000..86e2572a7a70c
--- /dev/null
+++ b/paddle/cinn/optim/transform_gpu_forloop.cc
@@ -0,0 +1,664 @@
+// Copyright (c) 2021 CINN Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "cinn/optim/transform_gpu_forloop.h"
+
+#include
+#include
+#include
+#include
+#include
+
+#include "cinn/backends/cuda_util.h"
+#include "cinn/common/cas.h"
+#include "cinn/common/ir_util.h"
+#include "cinn/ir/ir.h"
+#include "cinn/ir/ir_mutator.h"
+#include "cinn/ir/ir_printer.h"
+#include "cinn/optim/ir_copy.h"
+#include "cinn/optim/ir_simplify.h"
+#include "cinn/optim/replace_var_with_expr.h"
+#include "cinn/poly/isl_utils.h"
+#include "cinn/poly/stage.h"
+#include "cinn/runtime/intrinsic.h"
+#include "cinn/utils/string.h"
+
+namespace cinn {
+namespace optim {
+
+/**
+ * 1. Determine the grid and block dimensions.
+ * It accepts domains like `[0, 20]` or `[0, min(20, M/2)]`; the domain must have an integer right bound.
+ *
+ * 2. Replace the grid/thread iterators with something like `threadIdx.x`, `threadIdx.y`.
+ *
+ * 3. Remove the forloops that own the gpu axis.
+ *   1. if the extent is an IntImm, just remove the forloop.
+ *   2. if the extent is a Min, replace the forloop with an IfThenElse that keeps the forloop's condition; an extra
+ *      lower-bound check is added if the forloop's min is not zero.
+ *
+ * @param expr The expression to mutate.
+ */
+void RemoveGpuForloopsAxis(Expr *expr) {
+  struct Mutator : public ir::IRMutator<> {
+    void operator()(Expr *expr) { ir::IRMutator<>::Visit(expr, expr); }
+
+   private:
+    void Visit(const ir::For *op, Expr *expr) override {
+      switch (op->for_type()) {
+        case ir::ForType::GPUBlock:
+          if (NeedToReplaceForloopWithIfThenElse(op)) {
+            ReplaceForloopWithIfThenElse(expr);
+          } else {
+            *expr = op->body;
+          }
+          IRMutator<>::Visit(expr, expr);
+          break;
+        case ir::ForType::GPUThread:
+          if (NeedToReplaceForloopWithIfThenElse(op)) {
+            ReplaceForloopWithIfThenElse(expr);
+          } else {
+            *expr = op->body;
+          }
+          IRMutator<>::Visit(expr, expr);
+          break;
+        default:
+          auto *node = expr->As<ir::For>();
+          IRMutator<>::Visit(&node->body, &node->body);
+          break;
+      }
+    }
+
+    bool NeedToReplaceForloopWithIfThenElse(const ir::For *n) const { return true; }
+
+    void ReplaceForloopWithIfThenElse(Expr *expr) {
+      auto *for_n = expr->As<ir::For>();
+      auto *poly_for_n = expr->As<ir::PolyFor>();
+      CHECK(for_n || poly_for_n);
+
+      Expr condition;
+
+      auto condition_append = [&](Expr new_cond) {
+        if (condition.defined()) {
+          condition = ir::And::Make(condition, new_cond);
+        } else {
+          condition = new_cond;
+        }
+      };
+
+      if (for_n) {
+        // for(i, 2, 100);
+        //       ^
+        if (for_n->min != common::make_const(0)) {
+          condition_append(ir::GE::Make(for_n->loop_var, for_n->min));
+        }
+
+        // for(i, 2, min(M/2, 20))
+        //              ^
+        condition_append(ir::LT::Make(for_n->loop_var, for_n->extent));
+      } else {
+        if (poly_for_n->init != common::make_const(0)) {
+          condition_append(ir::GE::Make(poly_for_n->iterator, poly_for_n->init));
+        }
+
+        condition_append(poly_for_n->condition);
+      }
+
+      CHECK(condition.defined());
+
+      VLOG(3) << "GPU replacing\n" << *expr;
+      VLOG(3) << "\nto\n";
+      auto if_n = ir::IfThenElse::Make(condition, for_n->body);
+      VLOG(3) << if_n;
+      *expr = if_n;
+    }
+
+    void Visit(const ir::PolyFor *op, Expr *expr) override {
+      // PolyFor nodes are expected to have been lowered to For nodes (e.g. by TransformPolyForToFor) before this
+      // pass runs, so a GPU-bound PolyFor here is a hard error.
+      const auto msg = "PolyFor is not allowed for GPU, only For nodes are allowed";
+      CHECK(op->for_type() != ir::ForType::GPUBlock) << msg;
+      CHECK(op->for_type() != ir::ForType::GPUThread) << msg;
+      CHECK(op->for_type() != ir::ForType::GPULane) << msg;
+    }
+  };
+
+  Mutator mutator;
+  mutator(expr);
+}
+
+/**
+ * The generated __syncthreads call will be wrapped in an `if (xxxx == 0) { }`; this is an artifact of the isl AST
+ * output. Drop the condition so that the call runs in all the threads, e.g.
+ * `if (threadIdx.x == 0) { __syncthreads(); }` becomes `__syncthreads();`.
+ */
+void CudaSyncThreadsDropIfThenElse(Expr *expr) {
+  struct Mutator : public ir::IRMutator<> {
+    void operator()(Expr *expr) { ir::IRMutator<>::Visit(expr, expr); }
+
+    void Visit(const ir::IfThenElse *op, Expr *expr) override {
+      blocked_statement_stack.push_back(expr);
+      ir::IRMutator<>::Visit(op, expr);
+      blocked_statement_stack.pop_back();
+    }
+
+    void Visit(const ir::Call *op, Expr *expr) override {
+      if (op->name == runtime::intrinsic::cuda_sync_threads) {
+        if (!blocked_statement_stack.empty()) {
+          auto *last_for = blocked_statement_stack.back()->As<ir::IfThenElse>();
+          if (auto *eq_n = last_for->condition.As<ir::EQ>()) {
+            if (eq_n->b() == common::make_const(0)) {
+              *blocked_statement_stack.back() = *expr;
+            }
+          }
+        }
+      }
+    }
+
+    // Stack of the enclosing IfThenElse statements on the path to the current node.
+    std::vector<ir::Expr *> blocked_statement_stack;
+  };
+
+  Mutator()(expr);
+}
+
+class RestructureVarNodes : public ir::IRMutator<> {
+ public:
+  void operator()(ir::Expr *expr) { ir::IRMutator<>::Visit(expr, expr); }
+
+ private:
+  void Visit(const ir::Load *load, Expr *op) override {
+    std::vector<ir::Expr> indices_copied;
+    for (const ir::Expr &indice : load->indices) {
+      indices_copied.push_back(IRCopy(indice));
+    }
+    op->As<ir::Load>()->indices = indices_copied;
+
+    IRMutator::Visit(load, op);
+  }
+
+  void Visit(const ir::Store *store, Expr *op) override {
+    std::vector<ir::Expr> indices_copied;
+    for (const ir::Expr &indice : store->indices) {
+      indices_copied.push_back(IRCopy(indice));
+    }
+    op->As<ir::Store>()->indices = indices_copied;
+
+    IRMutator::Visit(store, op);
+  }
+};
+
+class ReplaceIndexToBindExpr : public ir::IRMutator<> {
+ public:
+  void operator()(ir::Expr *expr) { ir::IRMutator<>::Visit(expr, expr); }
+
+ private:
+  void Visit(const ir::ScheduleBlockRealize *op, Expr *expr) override {
+    ir::ScheduleBlockRealize *schedule_block_realize = expr->As<ir::ScheduleBlockRealize>();
+    CHECK(schedule_block_realize->schedule_block.As<ir::ScheduleBlock>());
+    std::vector<ir::Expr> iter_values = schedule_block_realize->iter_values;
+    ir::Expr body = schedule_block_realize->schedule_block.As<ir::ScheduleBlock>()->body;
+    std::vector<ir::Var> iter_vars = schedule_block_realize->schedule_block.As<ir::ScheduleBlock>()->iter_vars;
+
+    CHECK_EQ(iter_values.size(), iter_vars.size());
+    for (int idx = 0; idx < iter_values.size(); ++idx) {
+      ReplaceVarWithExpr(&body, iter_vars[idx], iter_values[idx]);
+    }
+    ir::IRMutator<>::Visit(&body, &body);
+  }
+};
+
+using TENSOR_LOOP = std::pair<ir::Expr, std::vector<ir::Expr>>;
+class CollectTensorLoopVisitor : public ir::IRMutator<> {
+ public:
+  void operator()(ir::Expr *expr) { ir::IRMutator<>::Visit(expr, expr); }
+
+ private:
+  void Visit(const ir::Store *op, Expr *expr) override {
+    auto tensor = op->tensor.as_tensor_ref();
+    // if the buffer is defined and is not on the heap:
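+    // (Buffers on the heap, i.e. global memory, are deliberately skipped here: presumably only shared/local
+    // buffers need their indices rewritten by the pass below.)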
+    if (tensor->buffer.defined() && tensor->buffer->memory_type != ir::MemoryType::Heap) {
+      if (buffer_tensor_loop_map_.count(tensor->buffer->name)) {
+        buffer_tensor_loop_map_[tensor->buffer->name].push_back(std::make_pair(*expr, loops_));
+      } else {
+        buffer_tensor_loop_map_[tensor->buffer->name] = {std::make_pair(*expr, loops_)};
+      }
+    }
+
+    IRMutator::Visit(op, expr);
+  }
+
+  void Visit(const ir::Load *op, Expr *expr) override {
+    if (op->is_addr_scalar()) {
+      return;
+    }
+    auto tensor = op->tensor.as_tensor_ref();
+    // if the buffer is defined and is not on the heap:
+    if (tensor->buffer.defined() && tensor->buffer->memory_type != ir::MemoryType::Heap) {
+      if (buffer_tensor_loop_map_.count(tensor->buffer->name)) {
+        buffer_tensor_loop_map_[tensor->buffer->name].push_back(std::make_pair(*expr, loops_));
+      } else {
+        buffer_tensor_loop_map_[tensor->buffer->name] = {std::make_pair(*expr, loops_)};
+      }
+    }
+
+    IRMutator::Visit(op, expr);
+  }
+
+  void Visit(const ir::For *op, Expr *expr) override {
+    loops_.push_back(*expr);
+    IRMutator::Visit(op, expr);
+    loops_.pop_back();
+  }
+
+  void Visit(const ir::PolyFor *op, Expr *expr) override { LOG(FATAL) << "Unknown PolyFor!"; }
+
+ public:
+  std::vector<ir::Expr> loops_;
+  std::unordered_map<std::string, std::vector<TENSOR_LOOP>> buffer_tensor_loop_map_;
+};
+
+void UpdateBufferAxisPass(ir::Expr *expr) {
+  CollectTensorLoopVisitor collect_tensor_loop_visitor;
+  collect_tensor_loop_visitor(expr);
+
+  auto buffer_tensor_loop = collect_tensor_loop_visitor.buffer_tensor_loop_map_;
+
+  for (auto &tmp : buffer_tensor_loop) {
+    auto tensor_loop_v = tmp.second;
+
+    auto &front = tensor_loop_v.front();
+    int count = tensor_loop_v.size() > 1 ? front.second.size() : 0;
+    for (int idx = 1; idx < tensor_loop_v.size(); ++idx) {
+      auto &other = tensor_loop_v[idx];
+      for (int idy = 0; idy < std::min(front.second.size(), other.second.size()); ++idy) {
+        if (front.second[idy] != other.second[idy]) {
+          count = std::min(count, idy);
+          break;
+        }
+      }
+    }
+
+    auto get_thread_bind_var = [](const std::vector<ir::Expr> &loops) {
+      // map from a threadIdx axis name to its (loop_var name, extent).
+      using ThreadLoopVarExtentMap = std::unordered_map<std::string, std::pair<std::string, int>>;
+      ThreadLoopVarExtentMap thread_loop_var_extent_map;
+      for (auto loop : loops) {
+        auto loop_ir = loop.As<ir::For>();
+        CHECK(loop_ir);
+        if (loop_ir->is_gpu_thread_binded()) {
+          std::string axis = "";
+          if (loop_ir->bind_info().offset == 0) {
+            axis = "threadIdx.x";
+          } else if (loop_ir->bind_info().offset == 1) {
+            axis = "threadIdx.y";
+          } else {
+            axis = "threadIdx.z";
+          }
+          // insert gpu thread loop var.
+          if (thread_loop_var_extent_map.count(axis)) {
+            auto &loop_var_extent = thread_loop_var_extent_map[axis];
+            if (loop_var_extent.second >= loop_ir->extent.as_int32()) {
+              thread_loop_var_extent_map[axis] = std::make_pair(loop_ir->loop_var->name, loop_ir->extent.as_int32());
+            }
+          } else {
+            thread_loop_var_extent_map[axis] = std::make_pair(loop_ir->loop_var->name, loop_ir->extent.as_int32());
+          }
+        }
+      }
+
+      std::unordered_set<std::string> loop_var_map;
+      for (auto &tmp : thread_loop_var_extent_map) {
+        loop_var_map.insert(tmp.second.first);
+      }
+
+      return loop_var_map;
+    };
+
+    auto load = front.first.As<ir::Load>();
+    auto store = front.first.As<ir::Store>();
+    auto tensor = load ?
load->tensor.as_tensor_ref() : store->tensor.as_tensor_ref(); + // find store and load keep loop for shared + std::vector> keep_loop_vars; + if (tensor->buffer->memory_type == ir::MemoryType::GPUShared) { + for (auto &tensor_loop : tensor_loop_v) { + keep_loop_vars.push_back(get_thread_bind_var(tensor_loop.second)); + } + CHECK_EQ(keep_loop_vars.size(), tensor_loop_v.size()); + } + + auto &loops = front.second; + for (int idx = 0; idx < count; ++idx) { + auto loop_expr = loops[idx]; + auto loop_ir = loop_expr.As(); + auto loop_var = loop_ir->loop_var; + + for (int idy = 0; idy < tensor_loop_v.size(); ++idy) { + auto expr = tensor_loop_v[idy].first; + auto load = expr.As(); + auto store = expr.As(); + if (keep_loop_vars.size() == 0 || !keep_loop_vars[idy].count(loop_var->name)) { + auto &indices = load ? load->indices : store->indices; + for (auto &indice : indices) { + optim::ReplaceVarWithExpr(&indice, loop_var, ir::Expr(0)); + indice = common::AutoSimplify(indice); + } + } + } + } + } +} + +class ReplaceLoopVarToGpu : public ir::IRMutator<> { + public: + void operator()(Expr *expr) { ir::IRMutator<>::Visit(expr, expr); } + + private: + void Visit(const ir::For *op, Expr *expr) override { + auto for_ir = expr->As(); + CHECK(for_ir); + + auto bind_info = for_ir->bind_info(); + + std::string var_name = ""; + if (bind_info.offset == 0) + var_name = "x"; + else if (bind_info.offset == 1) + var_name = "y"; + else if (bind_info.offset == 2) + var_name = "z"; + if (for_ir->is_gpu_block_binded()) { + var_name = "blockIdx." + var_name; + optim::ReplaceVarWithExpr(expr, op->loop_var, ir::Expr(ir::Var(var_name))); + } else if (for_ir->is_gpu_thread_binded()) { + var_name = "threadIdx." + var_name; + optim::ReplaceVarWithExpr(expr, op->loop_var, ir::Expr(ir::Var(var_name))); + } + + ir::IRMutator<>::Visit(&for_ir->body, &for_ir->body); + } + void Visit(const ir::PolyFor *op, Expr *expr) override { LOG(FATAL) << "Unkown PolyFor!"; } +}; + +class SharedAxisVisitor : public ir::IRMutator<> { + public: + void operator()(ir::Expr *expr) { ir::IRMutator<>::Visit(expr, expr); } + + private: + void Visit(const ir::Store *op, Expr *expr) override { + auto store = expr->As(); + if (!store->tensor.as_tensor_ref()->buffer.defined()) { + return; + } + + if (store->tensor.as_tensor_ref()->buffer->memory_type == ir::MemoryType::GPUShared) { + for (auto &indice : store->indices) { + for (auto axis : gpu_axis) { + optim::ReplaceVarWithExpr(&indice, ir::Var(axis), ir::Expr(0)); + } + indice = common::AutoSimplify(indice); + } + } + ir::IRMutator<>::Visit(op, expr); + } + + void Visit(const ir::Load *op, Expr *expr) override { + auto load = expr->As(); + if (load->is_addr_scalar()) { + return; + } + if (!load->tensor.as_tensor_ref()->buffer.defined()) { + return; + } + + if (load->tensor.as_tensor_ref()->buffer->memory_type == ir::MemoryType::GPUShared) { + for (auto &indice : load->indices) { + for (auto axis : gpu_axis) { + optim::ReplaceVarWithExpr(&indice, ir::Var(axis), ir::Expr(0)); + } + indice = common::AutoSimplify(indice); + } + } + ir::IRMutator<>::Visit(op, expr); + } + + const std::vector gpu_axis = {"blockIdx.x", "blockIdx.y", "blockIdx.z"}; +}; + +class LocalAxisVisitor : public ir::IRMutator<> { + public: + void operator()(ir::Expr *expr) { ir::IRMutator<>::Visit(expr, expr); } + + private: + void Visit(const ir::Store *op, Expr *expr) override { + auto store = expr->As(); + if (!store->tensor.as_tensor_ref()->buffer.defined()) { + return; + } + + if (store->tensor.as_tensor_ref()->buffer->memory_type 
== ir::MemoryType::GPULocal) {
+      for (auto &indice : store->indices) {
+        for (auto axis : gpu_axis) {
+          optim::ReplaceVarWithExpr(&indice, ir::Var(axis), ir::Expr(0));
+        }
+        indice = common::AutoSimplify(indice);
+      }
+    }
+    ir::IRMutator<>::Visit(op, expr);
+  }
+
+  void Visit(const ir::Load *op, Expr *expr) override {
+    auto load = expr->As<ir::Load>();
+    if (load->is_addr_scalar()) {
+      return;
+    }
+    if (!load->tensor.as_tensor_ref()->buffer.defined()) {
+      return;
+    }
+
+    if (load->tensor.as_tensor_ref()->buffer->memory_type == ir::MemoryType::GPULocal) {
+      for (auto &indice : load->indices) {
+        for (auto axis : gpu_axis) {
+          optim::ReplaceVarWithExpr(&indice, ir::Var(axis), ir::Expr(0));
+        }
+        indice = common::AutoSimplify(indice);
+      }
+    }
+    ir::IRMutator<>::Visit(op, expr);
+  }
+
+  const std::vector<std::string> gpu_axis = {
+      "blockIdx.x", "blockIdx.y", "blockIdx.z", "threadIdx.x", "threadIdx.y", "threadIdx.z"};
+};
+
+class ResizeBufferSizeVisitor : public ir::IRMutator<> {
+ public:
+  void operator()(ir::Expr *expr) { ir::IRMutator<>::Visit(expr, expr); }
+
+ private:
+  void Visit(const ir::Store *op, Expr *expr) override {
+    auto store = expr->As<ir::Store>();
+    auto store_tensor = store->tensor.as_tensor_ref();
+
+    if (!store_tensor->buffer.defined()) {
+      return;
+    }
+    if (store_tensor->buffer->memory_type == ir::MemoryType::Heap) {
+      ir::IRMutator<>::Visit(op, expr);
+      return;
+    }
+
+    auto &indices = store->indices;
+    auto &shape = store_tensor->shape;
+    auto &buffer = store_tensor->buffer->shape;
+
+    shape.clear();
+    buffer.clear();
+    for (int idx = 0; idx < indices.size(); ++idx) {
+      shape.push_back(ir::Expr(BufferSize(indices[idx])));
+      buffer.push_back(shape.back());
+    }
+    ir::IRMutator<>::Visit(op, expr);
+  }
+
+  void Visit(const ir::Load *op, Expr *expr) override {
+    auto load = expr->As<ir::Load>();
+    if (!load->tensor.as_tensor_ref()->buffer.defined()) {
+      return;
+    }
+
+    if (load->tensor.as_tensor_ref()->buffer->memory_type == ir::MemoryType::Heap) {
+      ir::IRMutator<>::Visit(op, expr);
+      return;
+    }
+
+    load->tensor.as_tensor_ref()->shape = load->tensor.as_tensor_ref()->buffer->shape;
+
+    // For the moment, align the load tensor's indices with the tensor shape using this trick; a better way would be
+    // to modify the FlattenLoop schedule.
+    int cnt = load->indices.size() - load->tensor.as_tensor_ref()->shape.size();
+    for (int i = 0; i < cnt; i++) {
+      load->indices.erase(load->indices.begin());
+    }
+    ir::IRMutator<>::Visit(op, expr);
+  }
+
+  void Visit(const ir::For *op, Expr *expr) override {
+    CHECK(expr->As<ir::For>());
+    auto for_ir = expr->As<ir::For>();
+    auto var_name = for_ir->loop_var->name;
+    auto extent_i = for_ir->extent;
+
+    if (extent_i.is_constant()) loop_2_extent_[var_name] = extent_i.as_int32();
+    ir::IRMutator<>::Visit(op, expr);
+  }
+
+  int BufferSize(ir::Expr indice) {
+    auto copy = IRCopy(indice);
+    auto vars = ir::CollectIRNodesInOrder(copy, [](const ir::Expr *expr) { return expr->As<ir::_Var_>(); });
+
+    int max_range = 1;
+    // compute the maximal index value by recursively enumerating each loop var over its extent.
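+    // For example (illustrative): for an index expression `4 * i + j` with recorded extents i < 2 and j < 4,
+    // the recursion below substitutes every (i, j) pair, finds the maximal value 7, and BufferSize returns 8.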
+ std::function compute_range = [&](const int deep, ir::Expr index) { + auto var = vars[deep].as_var_ref(); + CHECK(loop_2_extent_.count(var->name)) << var->name; + auto extent = loop_2_extent_.find(var->name)->second; + + for (int idx = 0; idx < extent; ++idx) { + auto tmp = IRCopy(index); + ReplaceVarWithExpr(&tmp, var, Expr(idx)); + + if (deep == vars.size() - 1) { + auto simplify = common::AutoSimplify(tmp); + auto range = common::AutoSimplify(simplify); + CHECK(range.is_constant()); + max_range = std::max(max_range, range.as_int32() + 1); + } else { + compute_range(deep + 1, tmp); + } + } + }; + + if (vars.size()) compute_range(0, copy); + return max_range; + } + + std::unordered_map loop_2_extent_; +}; + +class ReplaceVarToZero : public ir::IRMutator<> { + public: + void operator()(ir::Expr *expr) { ir::IRMutator<>::Visit(expr, expr); } + + private: + void Visit(const ir::Store *op, Expr *expr) override { + auto store = expr->As(); + if (!store->tensor.as_tensor_ref()->buffer.defined()) { + return; + } + + auto &indices = store->indices; + for (auto &indice : indices) { + for (auto var_ : loop_var_) { + optim::ReplaceVarWithExpr(&indice, ir::Var(var_), ir::Expr(0)); + } + indice = common::AutoSimplify(indice); + } + ir::IRMutator<>::Visit(op, expr); + } + + void Visit(const ir::Load *op, Expr *expr) override { + auto load = expr->As(); + if (!load->tensor.as_tensor_ref()->buffer.defined()) { + return; + } + + auto &indices = load->indices; + for (auto &indice : indices) { + for (auto var_ : loop_var_) { + optim::ReplaceVarWithExpr(&indice, ir::Var(var_), ir::Expr(0)); + } + indice = common::AutoSimplify(indice); + } + + ir::IRMutator<>::Visit(op, expr); + } + + void Visit(const ir::For *op, Expr *expr) override { + CHECK(expr->As()); + auto for_ir = expr->As(); + auto var_name = for_ir->loop_var->name; + auto extent_i = for_ir->extent; + + if (extent_i.is_constant() && extent_i.as_int32() == 1) loop_var_.insert(var_name); + ir::IRMutator<>::Visit(op, expr); + loop_var_.erase(var_name); + } + std::unordered_set loop_var_; +}; + +void OptimizeExprGPU(Expr *expr) { + VLOG(2) << "Before Optimize Expr:\n" << *expr; + + // copy var nodes to prevent one modification leading to multiple changes + RestructureVarNodes restructure_var_nodes; + restructure_var_nodes(expr); + + // replace var to bind expr + ReplaceIndexToBindExpr replace_index_to_bind_expr; + replace_index_to_bind_expr(expr); + + // resize buffer axis + UpdateBufferAxisPass(expr); + + // replace var name with block/thread + ReplaceLoopVarToGpu replace_loop_var_to_gpu; + replace_loop_var_to_gpu(expr); + + // update shared buffer axis + SharedAxisVisitor shared_axis_visitor; + shared_axis_visitor(expr); + + // update local buffer axis + LocalAxisVisitor local_axis_visitor; + local_axis_visitor(expr); + + ResizeBufferSizeVisitor resize_buffer_size_visitor; + resize_buffer_size_visitor(expr); + + ReplaceVarToZero replace_var_to_zero; + replace_var_to_zero(expr); + + VLOG(2) << "After Optimize Expr: \n" << *expr; +} + +} // namespace optim +} // namespace cinn diff --git a/paddle/cinn/optim/transform_gpu_forloop.h b/paddle/cinn/optim/transform_gpu_forloop.h new file mode 100644 index 0000000000000..bffe8f412c8a7 --- /dev/null +++ b/paddle/cinn/optim/transform_gpu_forloop.h @@ -0,0 +1,65 @@ +// Copyright (c) 2021 CINN Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include
+#include
+#include
+
+#include "cinn/ir/ir.h"
+#include "cinn/ir/lowered_func.h"
+#include "cinn/poly/isl_utils.h"
+#include "cinn/poly/stage.h"
+
+namespace cinn {
+namespace optim {
+
+/**
+ * Optimize an Expr for GPU:
+ *   - replace GPU-bound 'for' loops with 'blockIdx'/'threadIdx' axes,
+ *   - update buffer indices to save memory,
+ *   - re-compute buffer sizes.
+ */
+void OptimizeExprGPU(Expr* expr);
+
+/**
+ * Remove the forloops of the block and thread axes, and add the kernel launch thread dimension information to the
+ * outermost LoweredFunc.
+ *
+ * For example, input the code:
+ * \code
+ * // Note here, the outermost expression should be a LoweredFunc
+ * _LoweredFunc_:
+ *   for (blockIdx.x, 0, 10)
+ *     for (threadIdx.x, 0, 20)
+ *       A(blockIdx.x, threadIdx.x)
+ * \endcode
+ *
+ * will be modified to
+ * \code
+ * _LoweredFunc_:
+ *   A(blockIdx.x, threadIdx.x)
+ * \endcode
+ *
+ * \note The dimensions of each threadIdx or blockIdx must be constant, so this pass only accepts For nodes; PolyFor
+ * \note nodes are not allowed to be GPU related.
+ */
+void RemoveGpuForloopsAxis(Expr* expr);
+
+/**
+ * Add __syncthreads() to shared memory producers.
+ */
+void CudaSyncThreadsDropIfThenElse(Expr* expr);
+
+}  // namespace optim
+}  // namespace cinn
diff --git a/paddle/cinn/optim/transform_polyfor_to_for.cc b/paddle/cinn/optim/transform_polyfor_to_for.cc
new file mode 100644
index 0000000000000..3913056fbf719
--- /dev/null
+++ b/paddle/cinn/optim/transform_polyfor_to_for.cc
@@ -0,0 +1,136 @@
+// Copyright (c) 2021 CINN Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
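+// Illustrative sketch of this pass's effect (not taken from the original sources): a PolyFor with a simple
+// upper-bound condition, roughly
+//   poly_for (i, init = 0, condition = (i <= 31), inc = 1) body
+// is rewritten into the equivalent For node
+//   for (i, 0, 32) body
+// i.e. the right bound of an LE condition is incremented by one (see PlusOneWithMinMax below) to form the
+// exclusive loop extent.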
+
+#include "cinn/optim/transform_polyfor_to_for.h"
+
+#include
+#include
+
+#include "cinn/common/arithmatic.h"
+#include "cinn/common/cas.h"
+#include "cinn/common/ir_util.h"
+#include "cinn/common/type.h"
+#include "cinn/ir/ir_mutator.h"
+#include "cinn/ir/ir_operators.h"
+#include "cinn/ir/ir_printer.h"
+#include "cinn/ir/ir_visitor.h"
+#include "cinn/optim/ir_copy.h"
+#include "cinn/optim/ir_simplify.h"
+
+namespace cinn {
+namespace optim {
+
+namespace {
+
+Expr PlusOneWithMinMax(Expr expr) {
+  auto* min_n = expr.As<ir::Min>();
+  auto* max_n = expr.As<ir::Max>();
+
+  if (min_n) {
+    min_n->a() = min_n->a() + 1;
+    min_n->b() = min_n->b() + 1;
+    Simplify(&min_n->a());
+    Simplify(&min_n->b());
+    return expr;
+  } else if (max_n) {
+    max_n->a() = max_n->a() + 1;
+    max_n->b() = max_n->b() + 1;
+    Simplify(&max_n->a());
+    Simplify(&max_n->b());
+    return expr;
+  }
+  return expr + 1;
+}
+
+struct PolyForWithSimpleConditionToForMutator : public ir::IRMutator<> {
+  void operator()(Expr* expr) { ir::IRMutator<>::Visit(expr, expr); }
+
+ private:
+  void Visit(Expr* expr) { ir::IRMutator<>::Visit(expr, expr); }
+
+  void Visit(const ir::PolyFor* op, Expr* expr) override {
+    auto* node = expr->As<ir::PolyFor>();
+    auto* ge_n = node->condition.As<ir::GE>();
+    auto* gt_n = node->condition.As<ir::GT>();
+    if (ge_n) {
+      node->condition = (ge_n->a() * -1) <= (ge_n->b() * -1);
+    }
+    if (gt_n) {
+      node->condition = (gt_n->a() * -1) < (gt_n->b() * -1);
+    }
+
+    auto* lt_n = node->condition.As<ir::LT>();
+    auto* le_n = node->condition.As<ir::LE>();
+
+    if (lt_n) {
+      if (lt_n->b() != common::make_const(0)) {
+        node->condition = lt_n->a() - lt_n->b() < 0;
+      }
+    }
+    if (le_n) {
+      if (le_n->b() != common::make_const(0)) {
+        node->condition = le_n->a() - le_n->b() <= 0;
+      }
+    }
+
+    lt_n = node->condition.As<ir::LT>();
+    le_n = node->condition.As<ir::LE>();
+    if (!(lt_n || le_n)) return;
+
+    // check that the lhs is the iterator
+    bool can_extract_extent = (lt_n && lt_n->a().as_var() && lt_n->a().as_var()->name == op->iterator->name) ||
+                              (le_n && le_n->a().as_var() && le_n->a().as_var()->name == op->iterator->name);
+
+    if (!can_extract_extent) {
+      if (node->condition.As<ir::LE>()) {
+        auto le = node->condition.As<ir::LE>();
+        CHECK(le->a().As<ir::Sub>());
+        CHECK_EQ(le->b().As<ir::IntImm>()->value, 0UL);
+        auto sub = le->a().As<ir::Sub>();
+        node->condition = ir::LE::Make(sub->a(), sub->b());
+      } else if (node->condition.As<ir::LT>()) {
+        auto lt = node->condition.As<ir::LT>();
+        CHECK(lt->a().As<ir::Sub>());
+        CHECK_EQ(lt->b().As<ir::IntImm>()->value, 0UL);
+        auto sub = lt->a().As<ir::Sub>();
+        node->condition = ir::LT::Make(sub->a(), sub->b());
+      } else {
+        LOG(FATAL) << "Unknown Type!";
+      }
+
+      lt_n = node->condition.As<ir::LT>();
+      le_n = node->condition.As<ir::LE>();
+      if (!(lt_n || le_n)) return;
+    }
+
+    Expr lhs = lt_n ? lt_n->a() : le_n->a();
+    Expr rhs = lt_n ? lt_n->b() : PlusOneWithMinMax(le_n->b());
+    rhs = common::AutoSimplify(rhs);
+
+    if (op->is_vectorized()) CHECK(op->vectorize_info().valid());
+
+    Expr new_for =
+        ir::For::Make(op->iterator, op->init, rhs, op->for_type(), op->device_api, op->body, op->vectorize_info());
+    *expr = new_for;
+
+    Visit(&new_for.As<ir::For>()->body);
+  }
+};
+
+}  // namespace
+
+void TransformPolyForToFor(Expr* expr, bool auto_separate) { PolyForWithSimpleConditionToForMutator()(expr); }
+
+}  // namespace optim
+}  // namespace cinn
diff --git a/paddle/cinn/optim/transform_polyfor_to_for.h b/paddle/cinn/optim/transform_polyfor_to_for.h
new file mode 100644
index 0000000000000..d31bc6c4584f7
--- /dev/null
+++ b/paddle/cinn/optim/transform_polyfor_to_for.h
@@ -0,0 +1,32 @@
+// Copyright (c) 2021 CINN Authors. All Rights Reserved.
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include "cinn/ir/ir.h" + +namespace cinn { +namespace optim { + +//! Transform the PolyFor node to For node. This will also separate the PolyFor with Min or Max conditions into two For +//! nodes if \p auto_separate is true. +void TransformPolyForToFor(Expr* expr, bool auto_separate = true); + +namespace detail { + +void PolyForWithSimpleConditionToFor(Expr* expr); + +} // namespace detail + +} // namespace optim +} // namespace cinn diff --git a/paddle/cinn/optim/transform_polyfor_to_for_test.cc b/paddle/cinn/optim/transform_polyfor_to_for_test.cc new file mode 100644 index 0000000000000..d98dd770c4549 --- /dev/null +++ b/paddle/cinn/optim/transform_polyfor_to_for_test.cc @@ -0,0 +1,109 @@ +// Copyright (c) 2021 CINN Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
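+// The test below lowers a 512x200x500 matmul, splits its i and j loops by a factor of 8, runs
+// TransformPolyForToFor, and compares the generated C code with a golden string; the non-divisible
+// j split (500 is not a multiple of 8) shows up as the cinn_min(8, (500 + (-8 * j_outer))) inner extent.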
+ +#include "cinn/optim/transform_polyfor_to_for.h" + +#include + +#include "cinn/cinn.h" + +namespace cinn { +namespace optim { + +TEST(Expr, basic) { + using namespace ir; // NOLINT + + Expr M(512); + Expr K(200); + Expr N(500); + Placeholder A("A", {M, K}); + Placeholder B("B", {K, N}); + + // C = A * B + Var k(K.as_int32(), "k0"); + + Tensor C = Compute( + {M, N}, [&](Var i, Var j) { return lang::ReduceSum(A(i, k) * B(k, j), {k}); }, "C"); + + auto stages = CreateStages({C}); + + { + stages[C]->Split("i", 8); + stages[C]->Split("j", 8); + } + + // Code gen + auto func = Lower("matmul", stages, {A, B, C}); + + Target target; + target.arch = Target::Arch ::X86; + target.bits = Target::Bit ::k32; + target.os = Target::OS ::Linux; + + { + ir::Module::Builder builder("module1", target); + builder.AddFunction(func); + + CodeGenC codegen(target); + codegen.SetInlineBuiltinCodes(false); + auto out = codegen.Compile(builder.Build(), CodeGenC::OutputKind::CImpl); + std::cout << "out:\n" << out; + } + + optim::TransformPolyForToFor(&func->body); + + { + ir::Module::Builder builder("module1", target); + builder.AddFunction(func); + + CodeGenC codegen(target); + codegen.SetInlineBuiltinCodes(false); + auto out = codegen.Compile(builder.Build(), CodeGenC::OutputKind::CImpl); + std::cout << "out:\n" << out; + + auto target_out = R"ROC( +#include +#include + +void matmul(void* _args, int32_t num_args) +{ + const cinn_buffer_t* _A = cinn_pod_value_to_buffer_p(&(((cinn_pod_value_t*)(_args))[0])); + const cinn_buffer_t* _B = cinn_pod_value_to_buffer_p(&(((cinn_pod_value_t*)(_args))[1])); + cinn_buffer_t* _C = cinn_pod_value_to_buffer_p(&(((cinn_pod_value_t*)(_args))[2])); + cinn_buffer_malloc((void*)(0), _C); + const float* A = ((const float*)(_A->memory)); + const float* B = ((const float*)(_B->memory)); + float* C = ((float*)(_C->memory)); + float* C__reduce_init = ((float*)(_C->memory)); + for (int32_t i_outer = 0; i_outer < 64; i_outer += 1) { + for (int32_t i_inner = 0; i_inner < 8; i_inner += 1) { + for (int32_t j_outer = 0; j_outer < 63; j_outer += 1) { + for (int32_t j_inner = 0; j_inner < cinn_min(8, (500 + (-8 * j_outer))); j_inner += 1) { + C__reduce_init[((500 * i_inner) + ((4000 * i_outer) + ((8 * j_outer) + j_inner)))] = 0.00000000f; + for (int32_t k0 = 0; k0 < 200; k0 += 1) { + C[((500 * i_inner) + ((4000 * i_outer) + ((8 * j_outer) + j_inner)))] = fma(A[((200 * i_inner) + ((1600 * i_outer) + k0))], B[((8 * j_outer) + ((500 * k0) + j_inner))], C[((500 * i_inner) + ((4000 * i_outer) + ((8 * j_outer) + j_inner)))]); + }; + }; + }; + }; + }; + cinn_buffer_free((void*)(0), _C); +} +)ROC"; + EXPECT_EQ(utils::Trim(target_out), utils::Trim(out)); + } +} + +} // namespace optim +} // namespace cinn diff --git a/paddle/cinn/optim/unroll_loops.cc b/paddle/cinn/optim/unroll_loops.cc new file mode 100755 index 0000000000000..7262a77878900 --- /dev/null +++ b/paddle/cinn/optim/unroll_loops.cc @@ -0,0 +1,118 @@ +// Copyright (c) 2021 CINN Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "cinn/optim/unroll_loops.h"
+
+#include
+#include
+
+#include "cinn/ir/ir_mutator.h"
+#include "cinn/ir/ir_operators.h"
+#include "cinn/ir/ir_printer.h"
+#include "cinn/optim/ir_copy.h"
+#include "cinn/optim/ir_replace.h"
+
+namespace cinn {
+namespace optim {
+
+namespace {
+
+struct UnrollMutator : public ir::IRMutator<> {
+  void operator()(Expr* expr) { ir::IRMutator<>::Visit(expr, expr); }
+
+ private:
+  // update auto_max_step_ from the specific attribute of a ScheduleBlock
+  void Visit(const ir::ScheduleBlock* op, Expr* expr) override {
+    auto attr_it = op->attrs.find(ir::attr::auto_unroll_max_step);
+    if (attr_it != op->attrs.end()) {
+      const int* attr_v = absl::get_if<int>(&attr_it->second);
+      if (attr_v) {
+        int value = *attr_v;
+        std::swap(auto_max_step_, value);
+        VLOG(5) << "auto_max_step is updated:" << auto_max_step_;
+        ir::IRMutator<>::Visit(op, expr);
+        std::swap(auto_max_step_, value);
+        return;
+      } else {
+        LOG(WARNING) << "Get invalid value of attr:" << ir::attr::auto_unroll_max_step;
+      }
+    }
+    ir::IRMutator<>::Visit(op, expr);
+  }
+
+  // count a Store node as a plain statement
+  void Visit(const ir::Store* op, Expr* expr) override {
+    IRMutator<>::Visit(op, expr);
+    ++flat_step_;
+  }
+
+  // check whether a for-loop can be unrolled, and unroll it if so
+  void Visit(const ir::For* op, Expr* expr) override {
+    IRMutator<>::Visit(op, expr);
+    if (op->extent.As<ir::IntImm>() == nullptr) {
+      VLOG(5) << "A loop to be unrolled should have a constant extent";
+      return;
+    }
+    int extent = op->extent.as_int32();
+
+    // check whether this for-loop can be unrolled under the auto-unroll conditions
+    bool unrollable =
+        (op->is_serial() && extent >= 0 && not_unrolled_depth_ == 0 && extent * flat_step_ <= auto_max_step_);
+
+    // it can also be unrolled if it carries the unrolled tag
+    unrollable = (unrollable || op->is_unrolled()) && extent <= max_unroll_extent_;
+
+    if (unrollable) {
+      Unroll(op, expr);
+      flat_step_ *= extent;
+    } else {
+      ++not_unrolled_depth_;
+    }
+  }
+
+  //! Unroll a forloop.
+  void Unroll(const ir::For* op, Expr* expr) {
+    std::vector<Expr> body;
+
+    auto* min = op->min.As<ir::IntImm>();
+    auto* extent = op->extent.As<ir::IntImm>();
+    if (!(min && extent)) return;
+
+    for (int i = min->value; i < extent->value; i++) {
+      Expr start = op->min + i;
+      body.push_back(optim::IRCopy(op->body));
+      optim::IrReplace(&body.back(), op->loop_var, start);
+    }
+
+    *expr = ir::Block::Make(body);
+  }
+
+ private:
+  // max permitted steps to be automatically unrolled in total
+  int auto_max_step_ = 0;
+  // max permitted extent of a loop to be unrolled
+  int max_unroll_extent_ = 50;
+
+  // the number of unrolled steps or plain statements so far
+  int flat_step_ = 0;
+  // the number of nested loops that were not unrolled
+  int not_unrolled_depth_ = 0;
+};
+
+}  // namespace
+
+void UnrollLoop(Expr* expr) { UnrollMutator()(expr); }
+
+}  // namespace optim
+}  // namespace cinn
diff --git a/paddle/cinn/optim/unroll_loops.h b/paddle/cinn/optim/unroll_loops.h
new file mode 100644
index 0000000000000..283991b4f81dc
--- /dev/null
+++ b/paddle/cinn/optim/unroll_loops.h
@@ -0,0 +1,24 @@
+// Copyright (c) 2021 CINN Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include "cinn/ir/ir.h"
+
+namespace cinn {
+namespace optim {
+
+//! Unroll for-loops that carry the `unrolled` tag, as well as small loops covered by the auto_unroll_max_step limit.
+void UnrollLoop(Expr* expr);
+
+}  // namespace optim
+}  // namespace cinn
diff --git a/paddle/cinn/optim/unroll_loops_test.cc b/paddle/cinn/optim/unroll_loops_test.cc
new file mode 100644
index 0000000000000..e4dbf49055da7
--- /dev/null
+++ b/paddle/cinn/optim/unroll_loops_test.cc
@@ -0,0 +1,101 @@
+// Copyright (c) 2022 CINN Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "cinn/optim/unroll_loops.h"
+
+#include
+
+#include
+
+#include "cinn/cinn.h"
+#include "cinn/ir/ir_schedule.h"
+#include "cinn/lang/lower.h"
+
+namespace cinn {
+namespace optim {
+
+TEST(UnrollLoops, unrolled_tag) {
+  using namespace ir;
+
+  Expr M(100);
+  Expr N(4);
+
+  Placeholder<float> A("A", {M, N});
+  Placeholder<float> B("B", {M, N});
+
+  Tensor C = Compute(
+      {M, N}, [&](Var i, Var j) { return A(i, j) * B(i, j); }, "C");
+
+  auto stages = CreateStages({C});
+
+  Target target = common::DefaultHostTarget();
+  auto func = cinn::lang::LowerVec("test_unrolled_tag", stages, {A, B, C}, {}, {}, nullptr, target, true);
+  auto ast_expr = func[0]->body;
+
+  ir::ModuleExpr mod_expr({ast_expr});
+  ir::IRSchedule ir_sch(mod_expr);
+  auto loops = ir_sch.GetLoops("C");
+  ASSERT_EQ(loops.size(), 2U);
+
+  // the extent of the loop exceeds the max value permitted by the unroll_loops pass
+  // (currently set to 50), so the loop cannot actually be unrolled
+  loops[1].As<ir::For>()->extent.As<ir::IntImm>()->value = 51;
+  ir_sch.Unroll(loops[1]);
+  UnrollLoop(&ast_expr);
+  loops = ir_sch.GetLoops("C");
+  ASSERT_EQ(loops.size(), 2U);
+
+  // unrolled correctly
+  loops[1].As<ir::For>()->extent.As<ir::IntImm>()->value = 4;
+  UnrollLoop(&ast_expr);
+  EXPECT_EQ(ir_sch.GetLoops("C").size(), 1);
+}
+
+TEST(UnrollLoops, auto_unroll) {
+  using namespace ir;
+
+  Expr M(100);
+  Expr N(4);
+  Expr O(5);
+  Expr const_value(float(2.11));
+
+  Placeholder<float> A("A", {M, N, O});
+
+  // B = A + 2.11
+  Tensor B = Compute(
+      {M, N, O}, [&](Var i, Var j, Var k) { return A(i, j, k) + const_value; }, "B");
+
+  auto stages = CreateStages({B});
+  Target target = common::DefaultHostTarget();
+  auto func = cinn::lang::LowerVec("test_auto_unroll", stages, {A, B}, {}, {}, nullptr, target, true);
+  auto ast_expr = func[0]->body;
+  ir::ModuleExpr mod_expr({ast_expr});
+  ir::IRSchedule ir_sch(mod_expr);
+  ASSERT_EQ(ir_sch.GetLoops("B").size(), 3);
+  UnrollLoop(&ast_expr);
+  // check that it remains unchanged after the UnrollLoop pass above
+  ASSERT_EQ(ir_sch.GetLoops("B").size(), 3);
+
+  ASSERT_TRUE(ast_expr.As<ir::Block>()->stmts.front().As<ir::ScheduleBlockRealize>() != nullptr);
+  auto* block_realize =
ast_expr.As<ir::Block>()->stmts.front().As<ir::ScheduleBlockRealize>();
+  auto* schedule_block = block_realize->schedule_block.As<ir::ScheduleBlock>();
+  // set the 'auto_unroll_max_step' attribute to 25, which is bigger than
+  // the product of the extents of the inner two loops
+  schedule_block->attrs.emplace(ir::attr::auto_unroll_max_step, 25);
+  UnrollLoop(&ast_expr);
+  EXPECT_EQ(ir_sch.GetLoops("B").size(), 1);
+}
+
+}  // namespace optim
+}  // namespace cinn
diff --git a/paddle/cinn/optim/var_mod_simplify.cc b/paddle/cinn/optim/var_mod_simplify.cc
new file mode 100644
index 0000000000000..af099fe028391
--- /dev/null
+++ b/paddle/cinn/optim/var_mod_simplify.cc
@@ -0,0 +1,91 @@
+// Copyright (c) 2021 CINN Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "cinn/optim/var_mod_simplify.h"
+
+#include
+
+#include "cinn/common/cas.h"
+#include "cinn/ir/ir_mutator.h"
+#include "cinn/ir/ir_printer.h"
+
+namespace cinn::optim {
+
+namespace {
+using namespace ir;  // NOLINT
+
+struct ReplaceModWithDivMutator : public ir::IRMutator<> {
+  void operator()(Expr* x) { ir::IRMutator<>::Visit(x, x); }
+
+  void Visit(const Mod* op, Expr* expr) override {
+    auto* node = expr->As<Mod>();
+    auto a = node->operand(0);
+    auto b = node->operand(1);
+    // a % b == a - (a / b) * b
+    *expr = ir::Div::Make(a, b);
+    *expr = ir::Mul::Make(b, *expr);
+    *expr = ir::Sub::Make(a, *expr);
+  }
+};
+
+struct ReplaceDivWithVarMutator : public ir::IRMutator<> {
+  absl::flat_hash_map<std::string, Expr> div_var_map_;
+  void operator()(Expr* x) { ir::IRMutator<>::Visit(x, x); }
+
+  void Visit(const Div* op, Expr* expr) override {
+    auto* node = expr->As<Div>();
+
+    auto a = node->operand(0);
+    auto b = node->operand(1);
+    // only deal with var/int
+    if (a.is_var() && b.is_constant()) {
+      auto a_var = a.As<_Var_>();
+      auto b_int = b.As<IntImm>();
+      CHECK(a_var);
+      CHECK(b_int);
+      std::string var_name = a_var->name + "/" + std::to_string(b_int->value);
+      div_var_map_[var_name] = ir::Div::Make(a, b);
+      *expr = Var(var_name);
+    }
+  }
+};
+
+struct ReplaceVarWithDivMutator : public ir::IRMutator<> {
+  absl::flat_hash_map<std::string, Expr> div_var_map_;
+  void operator()(Expr* x, const absl::flat_hash_map<std::string, Expr>& div_var_map) {
+    div_var_map_ = div_var_map;
+    ir::IRMutator<>::Visit(x, x);
+  }
+
+  void Visit(const _Var_* op, Expr* expr) override {
+    auto* node = expr->As<_Var_>();
+    CHECK(node);
+    if (div_var_map_.count(node->name)) {
+      *expr = div_var_map_[node->name];
+    }
+  }
+};
+
+}  // namespace
+
+void VarModSimplify(Expr* e) {
+  *e = common::AutoSimplify(*e);
+  ReplaceModWithDivMutator()(e);
+  ReplaceDivWithVarMutator mutator;
+  mutator(e);
+  *e = common::AutoSimplify(*e);
+  ReplaceVarWithDivMutator()(e, mutator.div_var_map_);
+}
+
+}  // namespace cinn::optim
diff --git a/paddle/cinn/optim/var_mod_simplify.h b/paddle/cinn/optim/var_mod_simplify.h
new file mode 100644
index 0000000000000..fb01e7e39215a
--- /dev/null
+++ b/paddle/cinn/optim/var_mod_simplify.h
@@ -0,0 +1,32 @@
+// Copyright (c) 2021 CINN Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include "cinn/ir/ir.h"
+
+/** Simplify expressions that contain div and mod of vars.
+ *
+ * For example, input the code
+ * \code
+ * ((i_j_k_fused / 3) * 144) + (48 * (i_j_k_fused % 3))
+ * \endcode
+ *
+ * with `i_j_k_fused` treated as a var, it will be simplified to
+ * \code
+ * 48 * i_j_k_fused
+ * \endcode
+ */
+namespace cinn::optim {
+
+void VarModSimplify(Expr* e);
+
+}  // namespace cinn::optim
diff --git a/paddle/cinn/optim/vectorize_loops.cc b/paddle/cinn/optim/vectorize_loops.cc
new file mode 100644
index 0000000000000..fc6d97f1daf4b
--- /dev/null
+++ b/paddle/cinn/optim/vectorize_loops.cc
@@ -0,0 +1,890 @@
+// Copyright (c) 2021 CINN Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "cinn/optim/vectorize_loops.h"
+
+#include
+#include
+
+#include
+#include
+#include
+#include
+#include
+
+#include "cinn/common/cas.h"
+#include "cinn/common/ir_util.h"
+#include "cinn/ir/collect_ir_nodes.h"
+#include "cinn/ir/ir_operators.h"
+#include "cinn/ir/ir_printer.h"
+#include "cinn/optim/ir_copy.h"
+#include "cinn/optim/ir_replace.h"
+#include "cinn/optim/ir_simplify.h"
+#include "cinn/optim/tensor_write_tell.h"
+#include "cinn/optim/unroll_loops.h"
+#include "cinn/utils/functional.h"
+
+namespace cinn {
+namespace optim {
+using namespace ir;  // NOLINT
+using common::make_const;
+using common::make_one;
+using common::make_zero;
+
+//! Widen an expression to the given number of lanes.
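+//! For example (illustrative): widening a scalar constant 1.f to 4 lanes yields Broadcast(1.f, 4), and widening an
+//! existing Broadcast(v, 2) to 4 lanes re-broadcasts its value v to 4 lanes.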
+Expr Widen(Expr e, int lanes) {
+  if (e.type().lanes() == lanes) return e;
+  if (const ir::Broadcast *op = e.As<ir::Broadcast>()) {
+    if (lanes % op->lanes == 0) {
+      return ir::Broadcast::Make(op->value, lanes);
+    }
+  }
+
+  CHECK_EQ(e.type().lanes(), 1) << "Cannot broadcast lanes from " << e.type().lanes() << " to " << lanes;
+  return ir::Broadcast::Make(e, lanes);
+}
+
+// Tell whether a tensor can be vectorized or not on CUDA by collecting the names
+// of tensors which meet all check predicates of vectorizing.
+class TensorVectorizeTeller : public ir::IRMutator<const Expr *> {
+ public:
+  TensorVectorizeTeller(const Var &iter_var,
+                        const int factor,
+                        const absl::flat_hash_map<std::string, common::CasInterval> *var_intervals)
+      : iter_var_(iter_var), factor_(factor), var_intervals_(var_intervals) {}
+
+  void Collect(const Expr *op) { IRMutator::Visit(op, op); }
+
+  // return true if the input tensor can be vectorized
+  bool CanBeVectorized(const std::string &tensor_name) const {
+    auto it = tensor2flag_.find(tensor_name);
+    return it != tensor2flag_.end() && it->second;
+  }
+
+ private:
+  const Var iter_var_;  // loop var of the new for-loop split from the vectorized loop
+  const int factor_;
+  const absl::flat_hash_map<std::string, common::CasInterval> *var_intervals_;
+  // map of (tensor name) -> (bool flag) to identify whether a tensor can be vectorized or not
+  std::unordered_map<std::string, bool> tensor2flag_;
+
+  void Visit(const ir::Store *expr, const Expr *op) override {
+    auto *node = op->As<ir::Store>();
+    CHECK(node);
+    IRMutator::Visit(&node->value, &node->value);
+    auto *tensor = node->tensor.As<ir::_Tensor_>();
+    CHECK(tensor);
+
+    // a tensor should pass all pre-condition checks every time it appears
+    if (!tensor2flag_.count(tensor->name) || tensor2flag_.at(tensor->name)) {
+      bool flag = MeetConditions(node->tensor, node->indices);
+      tensor2flag_[tensor->name] = flag;
+    }
+  }
+
+  void Visit(const ir::Load *expr, const Expr *op) override {
+    auto *node = op->As<ir::Load>();
+    CHECK(node);
+    auto *tensor = node->tensor.As<ir::_Tensor_>();
+    CHECK(tensor);
+
+    // a tensor should pass all pre-condition checks every time it appears
+    if (!tensor2flag_.count(tensor->name) || tensor2flag_.at(tensor->name)) {
+      bool flag = MeetConditions(node->tensor, node->indices);
+      tensor2flag_[tensor->name] = flag;
+    }
+  }
+
+  // return true if the tensor meets all conditions for vectorizing
+  bool MeetConditions(const Expr &expr, const std::vector<ir::Expr> &indices) {
+    const ir::_Tensor_ *tensor = expr.As<ir::_Tensor_>();
+    auto find_matched_var_fn = [&](const Expr *x) { return x->As<_Var_>() && x->As<_Var_>()->name == iter_var_->name; };
+
+    // the size of the last dim should be divisible by the factor
+    if (tensor->shape.empty() || !tensor->shape.back().As<ir::IntImm>() ||
+        tensor->shape.back().as_int32() % factor_ != 0) {
+      VLOG(5) << "Size of the last dim of tensor:" << tensor->name << " can't be divisible by factor:" << factor_
+              << ", shape:" << utils::Join(tensor->shape, ",");
+      return false;
+    }
+
+    // the iter var must appear in the last index
+    if (indices.empty() || ir::CollectIRNodes(indices.back(), find_matched_var_fn).empty()) {
+      VLOG(5) << "Loop var:" << iter_var_->name << " is not used in the last index";
+      return false;
+    }
+
+    // the iter var can't appear in multiple indices
+    for (int i = 0; i < indices.size() - 1; ++i) {
+      auto repeat_found = ir::CollectIRNodes(indices[i], find_matched_var_fn);
+      if (!repeat_found.empty()) {
+        VLOG(5) << "Loop var:" << iter_var_->name << " is used at more than the last index, current:" << i;
+        return false;
+      }
+    }
+
+    // check that the tensor is accessed sequentially by comparing the indices one by one
+    Expr first_idx
= optim::IRCopy(indices.back());
+    optim::IrReplace(&first_idx, Expr(iter_var_), Expr(0));
+    const auto &interval = var_intervals_->at(iter_var_->name);
+    for (int i = 1; i < interval.r; ++i) {
+      Expr next_idx = optim::IRCopy(indices.back());
+      optim::IrReplace(&next_idx, Expr(iter_var_), Expr(i));
+      auto gap = common::AutoSimplify(Expr(next_idx - first_idx));
+      if (!gap.As<ir::IntImm>() || gap.as_int32() != i) {
+        VLOG(5) << "Tensor:" << tensor->name << " is not accessed sequentially, next:" << next_idx
+                << ", first:" << first_idx << ", gap:" << gap;
+        return false;
+      }
+      VLOG(5) << "Tensor:" << tensor->name << " is accessed sequentially, next:" << next_idx << ", first:" << first_idx
+              << ", gap:" << gap;
+    }
+
+    auto dtype = expr->type().ElementOf();
+    bool type_supported =
+        dtype.is_float(32) || dtype.is_int(32) || dtype.is_uint(32) || dtype.is_float16() || dtype.is_bfloat16();
+    if (!type_supported) {
+      VLOG(5) << "Only support vectorizing int, uint, float, float16 and bfloat16, but got " << dtype;
+      return false;
+    }
+    return true;
+  }
+};
+
+// Find tensors accessed sequentially in a for-loop to be vectorized,
+// and substitute the corresponding CUDA built-in vector types for them.
+class CudaVectorizer : public IRMutator<> {
+  const Var iter_var_;  // the loop var of the vectorized loop
+  const int factor_;    // the factor for vectorizing
+
+  TensorWriteTeller write_teller_;
+  TensorVectorizeTeller vectorized_teller_;
+
+  absl::flat_hash_map<std::string, Var> tensor2vectorized_vars_;
+  std::vector<Expr> vectorized_cast_exprs_;
+  std::vector<Expr> vectorized_store_exprs_;
+
+ public:
+  static constexpr int CudaVectorTypeMaxLanes = 8;
+  CudaVectorizer(const Var &iter_var,
+                 const int factor,
+                 const absl::flat_hash_map<std::string, common::CasInterval> *var_intervals)
+      : iter_var_(iter_var), factor_(factor), vectorized_teller_(iter_var, factor, var_intervals) {
+    CHECK(factor <= CudaVectorTypeMaxLanes)
+        << "The maximum lanes of valid CUDA vector types: " << CudaVectorTypeMaxLanes << ", but factor: " << factor;
+  }
+
+  // return all cast statements collected through vectorizing
+  std::vector<Expr> VectorizedTypeCastExprs() { return vectorized_cast_exprs_; }
+
+  // return all store statements collected through vectorizing
+  std::vector<Expr> VectorizedTypeStoreExprs() { return vectorized_store_exprs_; }
+
+  void Visit(Expr *expr) {
+    write_teller_.Collect(expr);
+    vectorized_teller_.Collect(expr);
+    IRMutator::Visit(expr, expr);
+  }
+
+  void Visit(const Load *op, Expr *expr) override {
+    auto *node = expr->As<Load>();
+    auto *tensor = node->tensor.As<ir::_Tensor_>();
+    if (node->is_addr_tensor() && vectorized_teller_.CanBeVectorized(tensor->name)) {
+      TensorVectorized(node, &node->indices, false);
+    }
+  }
+
+  void Visit(const Store *op, Expr *expr) override {
+    auto *node = expr->As<Store>();
+    auto *tensor = node->tensor.As<ir::_Tensor_>();
+    CHECK(tensor);
+    if (vectorized_teller_.CanBeVectorized(tensor->name)) {
+      TensorVectorized(node, &node->indices, true);
+    }
+
+    IRMutator::Visit(&node->value, &node->value);
+  }
+
+ private:
+  void TensorVectorized(ir::LoadStoreAddrMnger *node, std::vector<Expr> *indices, bool is_store) {
+    auto *tensor = node->tensor.As<ir::_Tensor_>();
+    VLOG(5) << "Vectorizing tensor:" << tensor->name;
+
+    // save the tensor and its corresponding vector name when it first appears
+    if (!tensor2vectorized_vars_.count(tensor->name)) {
+      AppendCast(node->tensor, *indices, is_store);
+    }
+
+    auto vectorized_var = tensor2vectorized_vars_.at(tensor->name);
+    // substitute a new tensor with the vector name and dtype
+    auto t = vectorized_var->type().is_cpp_handle() ?
node->tensor->type().PointerOf() : node->tensor->type();
+    node->tensor = ir::Tensor(vectorized_var->name, t, {Expr(factor_)}, {Expr(factor_)}, tensor->operation);
+    // keep only the last iteration index
+    indices->assign({iter_var_});
+  }
+
+  std::string GetVectorTypeName(Type type) {
+    std::string name_prefix = common::customized_type::kcuda_builtin_vector_t;
+#define GET_CUDA_VECTOR_TYPE_NAME(pred_expr, scalar_name)       \
+  if (pred_expr) {                                              \
+    return name_prefix + scalar_name + std::to_string(factor_); \
+  }
+
+    GET_CUDA_VECTOR_TYPE_NAME(type.is_int(32), "int");
+    GET_CUDA_VECTOR_TYPE_NAME(type.is_uint(32), "uint");
+    GET_CUDA_VECTOR_TYPE_NAME(type.is_float(32), "float");
+    GET_CUDA_VECTOR_TYPE_NAME(type.is_float16(), "half");
+    GET_CUDA_VECTOR_TYPE_NAME(type.is_bfloat16(), "bfloat16");
+#undef GET_CUDA_VECTOR_TYPE_NAME
+
+    // others are not implemented yet
+    CINN_NOT_IMPLEMENTED
+    return "";
+  }
+
+  void AppendCast(Expr tensor, const std::vector<Expr> &indices, bool is_store) {
+    auto *node = tensor.As<ir::_Tensor_>();
+    bool is_const = !write_teller_.IsWrite(node->name);
+
+    // generate the corresponding vector type
+    Type scalar_type = tensor->type().ElementOf();
+    Type vector_type_ptr(Type::type_t::Customized, scalar_type.bits(), factor_);
+    Type vector_type(Type::type_t::Customized, scalar_type.bits(), factor_);
+    vector_type_ptr.set_customized_type(GetVectorTypeName(scalar_type));
+    vector_type_ptr.set_cpp_handle();
+    vector_type_ptr.set_cpp_const(is_const);
+
+    vector_type.set_customized_type(GetVectorTypeName(scalar_type));
+    vector_type.set_cpp_const(is_const);
+
+    // generate a local vector variable to be used in subsequent statements
+    std::string vectorized_name = "vectorized_" + node->name;
+    Var vectorized_var = _Var_::Make(vectorized_name, vector_type);
+    tensor2vectorized_vars_.emplace(node->name, vectorized_var);
+
+    // generate a get_addr expr to get the address of the tensor
+    Expr converted_tensor = Load::Make(tensor, indices);
+    optim::IrReplace(&converted_tensor, iter_var_, Expr(int32_t(0)));
+    auto get_addr = ir::intrinsics::GetAddr::Make(converted_tensor);
+
+    // generate a let expression to cast the tensor into the local vector
+    auto cast = ir::Cast::Make(vector_type_ptr, get_addr);
+    if (!is_store) {
+      auto load = Load::Make(cast, {make_const(0)});
+      auto let = Let::Make(vectorized_var, load);
+      vectorized_cast_exprs_.emplace_back(let);
+      VLOG(5) << "Append a vectorized expr:" << let;
+    } else {
+      Var vectorized_ptr = _Var_::Make(vectorized_name + "_ptr", vector_type_ptr);
+
+      auto let1 = Let::Make(vectorized_ptr, cast);
+      auto let2 = Let::Make(vectorized_var, Expr(0));
+      vectorized_cast_exprs_.emplace_back(let1);
+      vectorized_cast_exprs_.emplace_back(let2);
+
+      VLOG(5) << "Append a vectorized expr:" << let1;
+      VLOG(5) << "Append a vectorized expr:" << let2;
+
+      auto t = ir::Tensor(
+          vectorized_ptr->name, node->type().PointerOf(), {Expr(factor_)}, {Expr(factor_)}, node->operation);
+      auto store = Store::Make(t, vectorized_var, {make_const(0)});
+
+      vectorized_store_exprs_.emplace_back(store);
+      VLOG(5) << "Append a vectorized expr:" << store;
+    }
+  }
+};
+
+//! Substitutes a vector for a scalar var in a Stmt.
+class Vectorizer : public IRMutator<> {
+  //! The name of the variable to be vectorized.
+  Var var;
+
+  int lanes_{-1};
+
+  bool need_scalarize_{false};
+
+  bool to_vectorize_{false};
+
+  Expr ramp_;
+
+  absl::flat_hash_map<std::string, common::CasInterval> var_intervals_;
+
+  //! A suffix to attach to widened variables.
+ std::string widen_suffix; + + public: + Vectorizer(const Var &var, int lanes, const absl::flat_hash_map &var_intervals = {}) + : var(var), lanes_(lanes), var_intervals_(var_intervals) { + // the identity ramp. + ramp_ = Ramp::Make(make_zero(), make_one(), lanes_); + } + + void Visit(Expr *expr) { + CHECK(!need_scalarize_); + IRMutator::Visit(expr, expr); + + if (need_scalarize_) { + need_scalarize_ = false; + Scalarize(expr); + } + } + + void Visit(const Cast *op, Expr *expr) override { + auto *node = expr->As(); + auto v0 = node->v(); + Visit(&node->v()); + if (v0.same_as(node->v())) return; + + Type t = op->type().with_lanes(node->v().type().lanes()); + node->set_type(t); + } + + void Visit(const _Var_ *op, Expr *expr) override { + if (op->name == var->name) { + *expr = Expr(ramp_); + } + } + + void Visit(const Add *op, Expr *expr) override { MutateAddSubOperator(op, expr); } + void Visit(const Sub *op, Expr *expr) override { MutateAddSubOperator(op, expr); } + void Visit(const Mul *op, Expr *expr) override { MutateMulDivOperator(op, expr); } + void Visit(const Div *op, Expr *expr) override { MutateMulDivOperator(op, expr); } + void Visit(const Mod *op, Expr *expr) override { MutateMulDivOperator(op, expr); } + void Visit(const Min *op, Expr *expr) override { BinaryOperatorVec(op, expr); } + void Visit(const Max *op, Expr *expr) override { BinaryOperatorVec(op, expr); } + void Visit(const EQ *op, Expr *expr) override { BinaryOperatorVec(op, expr); } + void Visit(const NE *op, Expr *expr) override { BinaryOperatorVec(op, expr); } + void Visit(const LT *op, Expr *expr) override { BinaryOperatorVec(op, expr); } + void Visit(const LE *op, Expr *expr) override { BinaryOperatorVec(op, expr); } + void Visit(const GT *op, Expr *expr) override { BinaryOperatorVec(op, expr); } + void Visit(const GE *op, Expr *expr) override { BinaryOperatorVec(op, expr); } + void Visit(const And *op, Expr *expr) override { BinaryOperatorVec(op, expr); } + void Visit(const Or *op, Expr *expr) override { BinaryOperatorVec(op, expr); } + + void Visit(const Ramp *op, Expr *expr) override {} + + void Visit(const Select *op, Expr *expr) override { + auto *node = expr->As